[clang] [CGBuiltin] Use freeze instruction to create an undef value instead of zero (PR #86967)

via cfe-commits cfe-commits at lists.llvm.org
Thu Mar 28 09:59:06 PDT 2024


https://github.com/AtariDreams updated https://github.com/llvm/llvm-project/pull/86967

From 2cbc890e5d2296105cf9f3eb350181e1c9f0ed2d Mon Sep 17 00:00:00 2001
From: Rose <gfunni234 at gmail.com>
Date: Thu, 28 Mar 2024 12:12:57 -0400
Subject: [PATCH] [CGBuiltin] Use freeze instruction to create an undef value
 instead of zero

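The x86 "undef" builtins (__builtin_ia32_undef128/256/512, reached through
intrinsics such as _mm256_undefined_ps) previously lowered to a zero
constant, leaving it to the IR optimizer and backend to prove the zero
unnecessary. With this change they lower to a freeze of an undef value
instead: the result is a fixed but unspecified value, and no zero has to be
materialized. As a rough sketch of the intent (illustrative only; the exact
value names, vector types, and surrounding -O0 loads/stores depend on the
builtin and the test, and are not the literal generated IR):

  Before (zero constant):
    ; __m256 v = _mm256_undefined_ps();
    ret <8 x float> zeroinitializer

  After (frozen undef):
    %undef.frozen = freeze <8 x float> undef
    ret <8 x float> %undef.frozen

Freezing matters because a bare undef may be refined to a different value at
each use; freeze pins it to one arbitrary value, which matches the x86
notion of "undefined" contents (PR32176) without biasing the value to zero.
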
---
 clang/lib/CodeGen/CGBuiltin.cpp              |     8 +-
 clang/test/CodeGen/X86/avx-builtins.c        |  9070 +++++++++-
 clang/test/CodeGen/X86/avx2-builtins.c       |   507 +-
 clang/test/CodeGen/X86/avx512f-builtins.c    |  7314 +++-----
 clang/test/CodeGen/X86/avx512fp16-builtins.c | 15956 ++++++++++++++---
 clang/test/CodeGen/X86/sse-builtins.c        |   496 +-
 clang/test/CodeGen/X86/sse2-builtins.c       |  6537 ++++++-
 7 files changed, 30128 insertions(+), 9760 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5ab5917c0c8da7..179c0cf3cc905a 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14305,11 +14305,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
   case X86::BI__builtin_ia32_undef256:
   case X86::BI__builtin_ia32_undef512:
     // The x86 definition of "undef" is not the same as the LLVM definition
-    // (PR32176). We leave optimizing away an unnecessary zero constant to the
-    // IR optimizer and backend.
-    // TODO: If we had a "freeze" IR instruction to generate a fixed undef
-    // value, we should use that here instead of a zero.
-    return llvm::Constant::getNullValue(ConvertType(E->getType()));
+    // (PR32176). Use a "freeze" IR instruction to generate a fixed undef
+    // value.
+    return Builder.CreateFreeze(UndefValue::get(ConvertType(E->getType())));
   case X86::BI__builtin_ia32_vec_init_v8qi:
   case X86::BI__builtin_ia32_vec_init_v4hi:
   case X86::BI__builtin_ia32_vec_init_v2si:
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 4bf1213d9fca97..bd60a5c2702d1d 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
@@ -9,2086 +10,9453 @@
 
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
 
+//
+// X86-LABEL: define void @test_mm256_add_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META3]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META3]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META3]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META3]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META3]]
+// X86-NEXT:    [[ADD_I:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <4 x double> [[ADD_I]], ptr [[TMP]], align 32, !alias.scope [[META3]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META3]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META3]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_add_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_add_pd
-  // CHECK: fadd <4 x double>
   return _mm256_add_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_add_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META6]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META6]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META6]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META6]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META6]]
+// X86-NEXT:    [[ADD_I:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <8 x float> [[ADD_I]], ptr [[TMP]], align 32, !alias.scope [[META6]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META6]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META6]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_add_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_add_ps
-  // CHECK: fadd <8 x float>
   return _mm256_add_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_addsub_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META9]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META9]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META9]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META9]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META9]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META9]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META9]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META9]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_addsub_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_addsub_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_addsub_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_addsub_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META12]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META12]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META12]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META12]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META12]]
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META12]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META12]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META12]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_addsub_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_addsub_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_addsub_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_and_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META15]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META15]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META15]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META15]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META15]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+// X86-NEXT:    [[AND_I:%.*]] = and <4 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[AND_I]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META15]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META15]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META15]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_and_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_and_pd
-  // CHECK: and <4 x i64>
   return _mm256_and_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_and_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META18]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META18]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META18]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META18]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META18]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+// X86-NEXT:    [[AND_I:%.*]] = and <8 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[AND_I]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META18]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META18]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META18]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_and_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_and_ps
-  // CHECK: and <8 x i32>
   return _mm256_and_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_andnot_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META21:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META21]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META21]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META21]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META21]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[NOT_I:%.*]] = xor <4 x i64> [[TMP3]], <i64 -1, i64 -1, i64 -1, i64 -1>
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META21]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+// X86-NEXT:    [[AND_I:%.*]] = and <4 x i64> [[NOT_I]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[AND_I]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META21]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META21]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META21]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_andnot_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_andnot_pd
-  // CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
-  // CHECK: and <4 x i64>
   return _mm256_andnot_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_andnot_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META24:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META24]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META24]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META24]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META24]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[NOT_I:%.*]] = xor <8 x i32> [[TMP3]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META24]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+// X86-NEXT:    [[AND_I:%.*]] = and <8 x i32> [[NOT_I]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[AND_I]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META24]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META24]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META24]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_andnot_ps
-  // CHECK: xor <8 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  // CHECK: and <8 x i32>
   return _mm256_andnot_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_blend_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[BLEND:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+// X86-NEXT:    store <4 x double> [[BLEND]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_blend_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_blend_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   return _mm256_blend_pd(A, B, 0x05);
 }
 
+//
+// X86-LABEL: define void @test_mm256_blend_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[BLEND:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[BLEND]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_blend_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_blend_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
   return _mm256_blend_ps(A, B, 0x35);
 }
 
+//
+// X86-LABEL: define void @test_mm256_blendv_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[V1:%.*]], <4 x double> noundef [[V2:%.*]], <4 x double> noundef [[V3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[V1_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[V2_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[V3_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[V1]], ptr [[V1_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[V2]], ptr [[V2_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[V3]], ptr [[V3_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[V1_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[V2_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[V3_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META27]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[__C_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[__C_ADDR_I]], align 32, !noalias [[META27]]
+// X86-NEXT:    [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x double> [[TMP5]])
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META27]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META27]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META27]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
-  // CHECK-LABEL: test_mm256_blendv_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_blendv_pd(V1, V2, V3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_blendv_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[V1:%.*]], <8 x float> noundef [[V2:%.*]], <8 x float> noundef [[V3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[V1_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[V2_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[V3_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[V1]], ptr [[V1_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[V2]], ptr [[V2_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[V3]], ptr [[V3_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[V1_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[V2_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[V3_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META30]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[__C_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[__C_ADDR_I]], align 32, !noalias [[META30]]
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]])
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META30]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META30]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META30]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) {
-  // CHECK-LABEL: test_mm256_blendv_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_blendv_ps(V1, V2, V3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_broadcast_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META33]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META33]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4, !noalias [[META33]]
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[TMP2]], align 1
+// X86-NEXT:    store <2 x double> [[TMP3]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE_I]], align 16, !noalias [[META33]]
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE_I]], align 16, !noalias [[META33]]
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[__B_I]], align 16, !noalias [[META33]]
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__B_I]], align 16, !noalias [[META33]]
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[__B_I]], align 16, !noalias [[META33]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META33]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META33]]
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META33]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_broadcast_pd(__m128d* A) {
-  // CHECK-LABEL: test_mm256_broadcast_pd
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcast_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_broadcast_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META36]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META36]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4, !noalias [[META36]]
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[TMP2]], align 1
+// X86-NEXT:    store <4 x float> [[TMP3]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE_I]], align 16, !noalias [[META36]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[COERCE_I]], align 16, !noalias [[META36]]
+// X86-NEXT:    store <4 x float> [[TMP5]], ptr [[__B_I]], align 16, !noalias [[META36]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[__B_I]], align 16, !noalias [[META36]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[__B_I]], align 16, !noalias [[META36]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META36]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META36]]
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META36]]
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_broadcast_ps(__m128* A) {
-  // CHECK-LABEL: test_mm256_broadcast_ps
-  // CHECK: load <4 x float>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
   return _mm256_broadcast_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_broadcast_sd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__D_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META39:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META39]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META39]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4, !noalias [[META39]]
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1
+// X86-NEXT:    store double [[TMP2]], ptr [[__D_I]], align 8, !noalias [[META39]]
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__D_I]], align 8, !noalias [[META39]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__D_I]], align 8, !noalias [[META39]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP4]], i32 1
+// X86-NEXT:    [[TMP5:%.*]] = load double, ptr [[__D_I]], align 8, !noalias [[META39]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP5]], i32 2
+// X86-NEXT:    [[TMP6:%.*]] = load double, ptr [[__D_I]], align 8, !noalias [[META39]]
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x double> [[VECINIT3_I]], double [[TMP6]], i32 3
+// X86-NEXT:    store <4 x double> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META39]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META39]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META39]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META39]]
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META39]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_broadcast_sd(double* A) {
-  // CHECK-LABEL: test_mm256_broadcast_sd
-  // CHECK: load double, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <4 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_broadcast_sd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_broadcast_ss(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__F_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1
+// X86-NEXT:    store float [[TMP2]], ptr [[__F_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[TMP4]], i32 1
+// X86-NEXT:    [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[TMP5]], i32 2
+// X86-NEXT:    [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <4 x float> [[VECINIT3_I]], float [[TMP6]], i32 3
+// X86-NEXT:    store <4 x float> [[VECINIT4_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP7]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP9]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128 test_mm_broadcast_ss(float* A) {
-  // CHECK-LABEL: test_mm_broadcast_ss
-  // CHECK: load float, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
   return _mm_broadcast_ss(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_broadcast_ss(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__F_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META42:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[TMP2:%.*]] = load float, ptr [[TMP1]], align 1
+// X86-NEXT:    store float [[TMP2]], ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[TMP3:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP4]], i32 1
+// X86-NEXT:    [[TMP5:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP5]], i32 2
+// X86-NEXT:    [[TMP6:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP6]], i32 3
+// X86-NEXT:    [[TMP7:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP7]], i32 4
+// X86-NEXT:    [[TMP8:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP8]], i32 5
+// X86-NEXT:    [[TMP9:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP9]], i32 6
+// X86-NEXT:    [[TMP10:%.*]] = load float, ptr [[__F_I]], align 4, !noalias [[META42]]
+// X86-NEXT:    [[VECINIT8_I:%.*]] = insertelement <8 x float> [[VECINIT7_I]], float [[TMP10]], i32 7
+// X86-NEXT:    store <8 x float> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META42]]
+// X86-NEXT:    [[TMP11:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META42]]
+// X86-NEXT:    store <8 x float> [[TMP11]], ptr [[TMP]], align 32, !alias.scope [[META42]]
+// X86-NEXT:    [[TMP12:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META42]]
+// X86-NEXT:    store <8 x float> [[TMP12]], ptr [[TMP]], align 32, !alias.scope [[META42]]
+// X86-NEXT:    [[TMP13:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP13]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP14:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP14]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_broadcast_ss(float* A) {
-  // CHECK-LABEL: test_mm256_broadcast_ss
-  // CHECK: load float, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <8 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_broadcast_ss(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castpd_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META45:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META45]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META45]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META45]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x double> [[TMP1]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META45]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META45]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META45]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_castpd_ps(__m256d A) {
-  // CHECK-LABEL: test_mm256_castpd_ps
   return _mm256_castpd_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castpd_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META48:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META48]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META48]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META48]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x double> [[TMP1]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META48]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META48]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META48]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_castpd_si256(__m256d A) {
-  // CHECK-LABEL: test_mm256_castpd_si256
   return _mm256_castpd_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castpd128_pd256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META51:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META51]]
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META51]]
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16, !noalias [[META51]]
+// X86-NEXT:    [[TMP2:%.*]] = freeze <2 x double> poison
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META51]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META51]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META51]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_castpd128_pd256(__m128d A) {
-  // CHECK-LABEL: test_mm256_castpd128_pd256
-  // CHECK: [[A:%.*]] = freeze <2 x double> poison
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_castpd128_pd256(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_castpd256_pd128(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm256_castpd256_pd128(__m256d A) {
-  // CHECK-LABEL: test_mm256_castpd256_pd128
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
   return _mm256_castpd256_pd128(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castps_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META54:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META54]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META54]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META54]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <8 x float> [[TMP1]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META54]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META54]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META54]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_castps_pd(__m256 A) {
-  // CHECK-LABEL: test_mm256_castps_pd
   return _mm256_castps_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castps_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META57]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META57]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META57]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <8 x float> [[TMP1]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META57]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META57]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META57]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_castps_si256(__m256 A) {
-  // CHECK-LABEL: test_mm256_castps_si256
   return _mm256_castps_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castps128_ps256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META60:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META60]]
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META60]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16, !noalias [[META60]]
+// X86-NEXT:    [[TMP2:%.*]] = freeze <4 x float> poison
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META60]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META60]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META60]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_castps128_ps256(__m128 A) {
-  // CHECK-LABEL: test_mm256_castps128_ps256
-  // CHECK: [[A:%.*]] = freeze <4 x float> poison
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_castps128_ps256(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_castps256_ps128(
+// X86-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x float> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm256_castps256_ps128(__m256 A) {
-  // CHECK-LABEL: test_mm256_castps256_ps128
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_castps256_ps128(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castsi128_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META63:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META63]]
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META63]]
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16, !noalias [[META63]]
+// X86-NEXT:    [[TMP2:%.*]] = freeze <2 x i64> poison
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x i64> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META63]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META63]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META63]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_castsi128_si256(__m128i A) {
-  // CHECK-LABEL: test_mm256_castsi128_si256
-  // CHECK: [[A:%.*]] = freeze <2 x i64> poison
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_castsi128_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castsi256_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META66:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META66]]
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META66]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32, !noalias [[META66]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META66]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META66]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META66]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_castsi256_pd(__m256i A) {
-  // CHECK-LABEL: test_mm256_castsi256_pd
   return _mm256_castsi256_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_castsi256_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META69:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META69]]
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META69]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32, !noalias [[META69]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META69]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META69]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META69]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_castsi256_ps(__m256i A) {
-  // CHECK-LABEL: test_mm256_castsi256_ps
   return _mm256_castsi256_ps(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_castsi256_si128(
+// X86-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    ret <2 x i64> [[SHUFFLE_I]]
+//
 __m128i test_mm256_castsi256_si128(__m256i A) {
-  // CHECK-LABEL: test_mm256_castsi256_si128
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
   return _mm256_castsi256_si128(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_ceil_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> [[TMP0]], i32 2)
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_ceil_pd(__m256d x) {
-  // CHECK-LABEL: test_mm256_ceil_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
   return _mm256_ceil_pd(x);
 }
 
+//
+// X86-LABEL: define void @test_mm_ceil_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> [[TMP0]], i32 2)
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm_ceil_ps(__m256 x) {
-  // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
   return _mm256_ceil_ps(x);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_eq_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_eq_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_eq_oq
-  // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_lt_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_lt_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_lt_os
-  // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_LT_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_le_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_le_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_le_os
-  // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_LE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_unord_q(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_unord_q(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_unord_q
-  // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_UNORD_Q);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_neq_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_neq_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_neq_uq
-  // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NEQ_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nlt_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nlt_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nlt_us
-  // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NLT_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nle_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nle_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nle_us
-  // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NLE_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ord_q(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ord_q(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ord_q
-  // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_ORD_Q);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_eq_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_eq_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_eq_uq
-  // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_EQ_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nge_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nge_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nge_us
-  // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NGE_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ngt_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ngt_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ngt_us
-  // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NGT_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_false_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_false_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_false_oq
-  // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_FALSE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_neq_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_neq_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_neq_oq
-  // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NEQ_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ge_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ge_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ge_os
-  // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_gt_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_gt_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_gt_os
-  // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_GT_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_true_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_true_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_true_uq
-  // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_TRUE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_eq_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_eq_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_eq_os
-  // CHECK: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_EQ_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_lt_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_lt_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_lt_oq
-  // CHECK: fcmp olt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_le_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_le_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_le_oq
-  // CHECK: fcmp ole <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_unord_s(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_unord_s(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_unord_s
-  // CHECK: fcmp uno <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_UNORD_S);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_neq_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_neq_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_neq_us
-  // CHECK: fcmp une <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NEQ_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nlt_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nlt_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nlt_uq
-  // CHECK: fcmp uge <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NLT_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nle_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nle_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nle_uq
-  // CHECK: fcmp ugt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NLE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ord_s(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ord_s(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ord_s
-  // CHECK: fcmp ord <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_ORD_S);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_eq_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_eq_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_eq_us
-  // CHECK: fcmp ueq <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_EQ_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_nge_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_nge_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_nge_uq
-  // CHECK: fcmp ult <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ngt_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ngt_uq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ngt_uq
-  // CHECK: fcmp ule <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NGT_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_false_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_false_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_false_os
-  // CHECK: fcmp false <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_FALSE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_neq_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_neq_os(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_neq_os
-  // CHECK: fcmp one <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_NEQ_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_ge_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_ge_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_ge_oq
-  // CHECK: fcmp oge <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_GE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_gt_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_gt_oq(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_gt_oq
-  // CHECK: fcmp ogt <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_GT_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_pd_true_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <4 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cmp_pd_true_us(__m256d a, __m256d b) {
-  // CHECK-LABEL: test_mm256_cmp_pd_true_us
-  // CHECK: fcmp true <4 x double> %{{.*}}, %{{.*}}
   return _mm256_cmp_pd(a, b, _CMP_TRUE_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_eq_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_eq_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_eq_oq
-  // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_lt_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_lt_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_lt_os
-  // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_LT_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_le_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_le_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_le_os
-  // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_LE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_unord_q(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_unord_q(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_unord_q
-  // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_UNORD_Q);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_neq_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_neq_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_neq_uq
-  // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NEQ_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nlt_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nlt_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nlt_us
-  // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NLT_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nle_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nle_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nle_us
-  // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NLE_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ord_q(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ord_q(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ord_q
-  // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_ORD_Q);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_eq_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_eq_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_eq_uq
-  // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_EQ_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nge_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nge_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nge_us
-  // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NGE_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ngt_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ngt_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ngt_us
-  // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NGT_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_false_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_false_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_false_oq
-  // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_FALSE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_neq_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_neq_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_neq_oq
-  // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NEQ_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ge_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ge_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ge_os
-  // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_gt_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_gt_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_gt_os
-  // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_GT_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_true_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_true_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_true_uq
-  // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_TRUE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_eq_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_eq_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_eq_os
-  // CHECK: fcmp oeq <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_EQ_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_lt_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_lt_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_lt_oq
-  // CHECK: fcmp olt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_le_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_le_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_le_oq
-  // CHECK: fcmp ole <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_unord_s(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_unord_s(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_unord_s
-  // CHECK: fcmp uno <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_UNORD_S);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_neq_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_neq_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_neq_us
-  // CHECK: fcmp une <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NEQ_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nlt_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nlt_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nlt_uq
-  // CHECK: fcmp uge <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NLT_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nle_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nle_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nle_uq
-  // CHECK: fcmp ugt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NLE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ord_s(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ord_s(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ord_s
-  // CHECK: fcmp ord <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_ORD_S);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_eq_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_eq_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_eq_us
-  // CHECK: fcmp ueq <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_EQ_US);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_nge_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_nge_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_nge_uq
-  // CHECK: fcmp ult <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ngt_uq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ngt_uq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ngt_uq
-  // CHECK: fcmp ule <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NGT_UQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_false_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_false_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_false_os
-  // CHECK: fcmp false <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_FALSE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_neq_os(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_neq_os(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_neq_os
-  // CHECK: fcmp one <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_NEQ_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_ge_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_ge_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_ge_oq
-  // CHECK: fcmp oge <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_GE_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_gt_oq(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_gt_oq(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_gt_oq
-  // CHECK: fcmp ogt <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_GT_OQ);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cmp_ps_true_us(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <8 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cmp_ps_true_us(__m256 a, __m256 b) {
-  // CHECK-LABEL: test_mm256_cmp_ps_true_us
-  // CHECK: fcmp true <8 x float> %{{.*}}, %{{.*}}
   return _mm256_cmp_ps(a, b, _CMP_TRUE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_eq_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_eq_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_eq_uq
-  // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_EQ_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nge_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nge_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nge_us
-  // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NGE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ngt_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ngt_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ngt_us
-  // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NGT_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_false_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_false_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_false_oq
-  // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_FALSE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_neq_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_neq_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_neq_oq
-  // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NEQ_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ge_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ge_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ge_os
-  // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_gt_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_gt_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_gt_os
-  // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_GT_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_true_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_true_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_true_uq
-  // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_TRUE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_eq_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_eq_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_eq_os
-  // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_EQ_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_lt_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_lt_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_lt_oq
-  // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_LT_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_le_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_le_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_le_oq
-  // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_LE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_unord_s(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_unord_s(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_unord_s
-  // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_UNORD_S);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_neq_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_neq_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_neq_us
-  // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NEQ_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nlt_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nlt_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nlt_uq
-  // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NLT_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nle_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nle_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nle_uq
-  // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NLE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ord_s(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ord_s(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ord_s
-  // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_ORD_S);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_eq_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_eq_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_eq_us
-  // CHECK: fcmp ueq <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_EQ_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nge_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nge_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nge_uq
-  // CHECK: fcmp ult <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NGE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ngt_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ngt_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ngt_uq
-  // CHECK: fcmp ule <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NGT_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_false_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_false_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_false_os
-  // CHECK: fcmp false <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_FALSE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_neq_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_neq_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_neq_os
-  // CHECK: fcmp one <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NEQ_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ge_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ge_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ge_oq
-  // CHECK: fcmp oge <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_GE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_gt_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_gt_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_gt_oq
-  // CHECK: fcmp ogt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_GT_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_true_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_true_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_true_us
-  // CHECK: fcmp true <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_TRUE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_eq_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_eq_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_eq_uq
-  // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_EQ_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_nge_us(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_nge_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nge_us
-  // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NGE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_ngt_us(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_ngt_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ngt_us
-  // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NGT_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_false_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_false_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_false_oq
-  // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_FALSE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_neq_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_neq_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_neq_oq
-  // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NEQ_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_ge_os(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_ge_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ge_os
-  // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_gt_os(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_gt_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_gt_os
-  // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_GT_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_true_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_true_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_true_uq
-  // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_TRUE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_eq_os(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_eq_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_eq_os
-  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_EQ_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_lt_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_lt_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_lt_oq
-  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_LT_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_le_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_le_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_le_oq
-  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_LE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_unord_s(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_unord_s(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_unord_s
-  // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_UNORD_S);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_neq_us(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_neq_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_neq_us
-  // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NEQ_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_nlt_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_nlt_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nlt_uq
-  // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NLT_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_nle_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_nle_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nle_uq
-  // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NLE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_ord_s(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_ord_s(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ord_s
-  // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_ORD_S);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_eq_us(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ueq <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_eq_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_eq_us
-  // CHECK: fcmp ueq <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_EQ_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_nge_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ult <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_nge_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nge_uq
-  // CHECK: fcmp ult <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NGE_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_ngt_uq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ule <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_ngt_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ngt_uq
-  // CHECK: fcmp ule <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NGT_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_false_os(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp false <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_false_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_false_os
-  // CHECK: fcmp false <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_FALSE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_neq_os(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp one <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_neq_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_neq_os
-  // CHECK: fcmp one <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NEQ_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_ge_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_ge_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ge_oq
-  // CHECK: fcmp oge <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_GE_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_gt_oq(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_gt_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_gt_oq
-  // CHECK: fcmp ogt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_GT_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ps_true_us(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp true <4 x float> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cmp_ps_true_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_true_us
-  // CHECK: fcmp true <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_TRUE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i8 13)
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128d test_mm_cmp_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmp_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
   return _mm_cmp_sd(A, B, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_ss(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> [[TMP0]], <4 x float> [[TMP1]], i8 13)
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128 test_mm_cmp_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_cmp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
   return _mm_cmp_ss(A, B, _CMP_GE_OS);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cvtepi32_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META72:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META72]]
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META72]]
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16, !noalias [[META72]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[CONV_I:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[CONV_I]], ptr [[TMP]], align 32, !alias.scope [[META72]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META72]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META72]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cvtepi32_pd(__m128i A) {
-  // CHECK-LABEL: test_mm256_cvtepi32_pd
-  // CHECK: sitofp <4 x i32> %{{.*}} to <4 x double>
   return _mm256_cvtepi32_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cvtepi32_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META75:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META75]]
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META75]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32, !noalias [[META75]]
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to <8 x i32>
+// X86-NEXT:    [[CONV_I:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[CONV_I]], ptr [[TMP]], align 32, !alias.scope [[META75]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META75]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META75]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_cvtepi32_ps(__m256i A) {
-  // CHECK-LABEL: test_mm256_cvtepi32_ps
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x float>
   return _mm256_cvtepi32_ps(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_cvtpd_epi32(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm256_cvtpd_epi32(__m256d A) {
-  // CHECK-LABEL: test_mm256_cvtpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %{{.*}})
   return _mm256_cvtpd_epi32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_cvtpd_ps(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> [[TMP1]])
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm256_cvtpd_ps(__m256d A) {
-  // CHECK-LABEL: test_mm256_cvtpd_ps
-  // CHECK: call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %{{.*}})
   return _mm256_cvtpd_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cvtps_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META78]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META78]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META78]]
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META78]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META78]]
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META78]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_cvtps_epi32(__m256 A) {
-  // CHECK-LABEL: test_mm256_cvtps_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %{{.*}})
   return _mm256_cvtps_epi32(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cvtps_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META81:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META81]]
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META81]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16, !noalias [[META81]]
+// X86-NEXT:    [[CONV_I:%.*]] = fpext <4 x float> [[TMP1]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[CONV_I]], ptr [[TMP]], align 32, !alias.scope [[META81]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META81]]
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META81]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_cvtps_pd(__m128 A) {
-  // CHECK-LABEL: test_mm256_cvtps_pd
-  // CHECK: fpext <4 x float> %{{.*}} to <4 x double>
   return _mm256_cvtps_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_cvttpd_epi32(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm256_cvttpd_epi32(__m256d A) {
-  // CHECK-LABEL: test_mm256_cvttpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %{{.*}})
   return _mm256_cvttpd_epi32(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_cvttps_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META84:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META84]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META84]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META84]]
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META84]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META84]]
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META84]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_cvttps_epi32(__m256 A) {
-  // CHECK-LABEL: test_mm256_cvttps_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %{{.*}})
   return _mm256_cvttps_epi32(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_div_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META87:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META87]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META87]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META87]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META87]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META87]]
+// X86-NEXT:    [[DIV_I:%.*]] = fdiv <4 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <4 x double> [[DIV_I]], ptr [[TMP]], align 32, !alias.scope [[META87]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META87]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META87]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_div_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_div_pd
-  // CHECK: fdiv <4 x double>
   return _mm256_div_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_div_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META90]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META90]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META90]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META90]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META90]]
+// X86-NEXT:    [[DIV_I:%.*]] = fdiv <8 x float> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <8 x float> [[DIV_I]], ptr [[TMP]], align 32, !alias.scope [[META90]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META90]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META90]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_div_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_div_ps
-  // CHECK: fdiv <8 x float>
   return _mm256_div_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_dp_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> [[TMP0]], <8 x float> [[TMP1]], i8 7)
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_dp_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_dp_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> {{.*}}, <8 x float> {{.*}}, i8 7)
   return _mm256_dp_ps(A, B, 7);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_extract_epi8(
+// X86-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <32 x i8>
+// X86-NEXT:    [[TMP2:%.*]] = extractelement <32 x i8> [[TMP1]], i64 31
+// X86-NEXT:    [[CONV:%.*]] = zext i8 [[TMP2]] to i32
+// X86-NEXT:    ret i32 [[CONV]]
+//
 int test_mm256_extract_epi8(__m256i A) {
-  // CHECK-LABEL: test_mm256_extract_epi8
-  // CHECK: extractelement <32 x i8> %{{.*}}, {{i32|i64}} 31
-  // CHECK: zext i8 %{{.*}} to i32
   return _mm256_extract_epi8(A, 31);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_extract_epi16(
+// X86-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <16 x i16>
+// X86-NEXT:    [[TMP2:%.*]] = extractelement <16 x i16> [[TMP1]], i64 15
+// X86-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// X86-NEXT:    ret i32 [[CONV]]
+//
 int test_mm256_extract_epi16(__m256i A) {
-  // CHECK-LABEL: test_mm256_extract_epi16
-  // CHECK: extractelement <16 x i16> %{{.*}}, {{i32|i64}} 15
-  // CHECK: zext i16 %{{.*}} to i32
   return _mm256_extract_epi16(A, 15);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_extract_epi32(
+// X86-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// X86-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i64 7
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm256_extract_epi32(__m256i A) {
-  // CHECK-LABEL: test_mm256_extract_epi32
-  // CHECK: extractelement <8 x i32> %{{.*}}, {{i32|i64}} 7
   return _mm256_extract_epi32(A, 7);
 }
 
 #if __x86_64__
+//
 long long test_mm256_extract_epi64(__m256i A) {
-  // X64-LABEL: test_mm256_extract_epi64
-  // X64: extractelement <4 x i64> %{{.*}}, {{i32|i64}} 3
   return _mm256_extract_epi64(A, 3);
 }
 #endif
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_extractf128_pd(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[EXTRACT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+// X86-NEXT:    store <2 x double> [[EXTRACT]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128d test_mm256_extractf128_pd(__m256d A) {
-  // CHECK-LABEL: test_mm256_extractf128_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
   return _mm256_extractf128_pd(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_extractf128_ps(
+// X86-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <4 x float> [[EXTRACT]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128 test_mm256_extractf128_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_extractf128_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf128_ps(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm256_extractf128_si256(
+// X86-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// X86-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[EXTRACT]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm256_extractf128_si256(__m256i A) {
-  // CHECK-LABEL: test_mm256_extractf128_si256
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm256_extractf128_si256(A, 1);
 }
 
+//
+// X86-LABEL: define void @test_mm256_floor_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> [[TMP0]], i32 1)
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_floor_pd(__m256d x) {
-  // CHECK-LABEL: test_mm256_floor_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
   return _mm256_floor_pd(x);
 }
 
+//
+// X86-LABEL: define void @test_mm_floor_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> [[TMP0]], i32 1)
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm_floor_ps(__m256 x) {
-  // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
   return _mm256_floor_ps(x);
 }
 
+//
+// X86-LABEL: define void @test_mm256_hadd_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META93:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META93]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META93]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META93]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META93]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META93]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META93]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META93]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META93]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_hadd_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hadd_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_hadd_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META96:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META96]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META96]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META96]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META96]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META96]]
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META96]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META96]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META96]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_hadd_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hadd_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_hsub_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META99:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META99]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META99]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META99]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META99]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META99]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META99]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META99]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META99]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_hsub_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_hsub_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_hsub_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META102]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META102]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META102]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META102]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META102]]
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META102]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META102]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META102]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_hsub_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_hsub_ps(A, B);
 }
 
+//
 __m256i test_mm256_insert_epi8(__m256i x, char b) {
-  // CHECK-LABEL: test_mm256_insert_epi8
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, {{i32|i64}} 14
   return _mm256_insert_epi8(x, b, 14);
 }
 
+//
+// X86-LABEL: define void @test_mm256_insert_epi16(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[X:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <16 x i16>
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP2]] to i16
+// X86-NEXT:    [[TMP3:%.*]] = insertelement <16 x i16> [[TMP1]], i16 [[CONV]], i64 4
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_insert_epi16(__m256i x, int b) {
-  // CHECK-LABEL: test_mm256_insert_epi16
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, {{i32|i64}} 4
   return _mm256_insert_epi16(x, b, 4);
 }
 
+//
+// X86-LABEL: define void @test_mm256_insert_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[X:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP2]], i64 5
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[TMP3]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_insert_epi32(__m256i x, int b) {
-  // CHECK-LABEL: test_mm256_insert_epi32
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, {{i32|i64}} 5
   return _mm256_insert_epi32(x, b, 5);
 }
 
 #if __x86_64__
+//
 __m256i test_mm256_insert_epi64(__m256i x, long long b) {
-  // X64-LABEL: test_mm256_insert_epi64
-  // X64: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, {{i32|i64}} 2
   return _mm256_insert_epi64(x, b, 2);
 }
 #endif
 
+//
+// X86-LABEL: define void @test_mm256_insertf128_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[WIDEN:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    [[INSERT:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[WIDEN]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[INSERT]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
-  // CHECK-LABEL: test_mm256_insertf128_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_insertf128_pd(A, B, 0);
 }
 
+//
+// X86-LABEL: define void @test_mm256_insertf128_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[INSERT:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[WIDEN]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// X86-NEXT:    store <8 x float> [[INSERT]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
-  // CHECK-LABEL: test_mm256_insertf128_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_insertf128_ps(A, B, 1);
 }
 
+//
+// X86-LABEL: define void @test_mm256_insertf128_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[WIDEN:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[INSERT:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[WIDEN]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[INSERT]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
-  // CHECK-LABEL: test_mm256_insertf128_si256
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
   return _mm256_insertf128_si256(A, B, 0);
 }
 
+//
+// X86-LABEL: define void @test_mm256_lddqu_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META105:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META105]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META105]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META105]]
+// X86-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <32 x i8> [[TMP2]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META105]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META105]]
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META105]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_lddqu_si256(__m256i* A) {
-  // CHECK-LABEL: test_mm256_lddqu_si256
-  // CHECK: call <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr %{{.*}})
   return _mm256_lddqu_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_load_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META108:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META108]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META108]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META108]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[TMP1]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META108]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META108]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META108]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_load_pd(double* A) {
-  // CHECK-LABEL: test_mm256_load_pd
-  // CHECK: load <4 x double>, ptr %{{.*}}, align 32
   return _mm256_load_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_load_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META111:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META111]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META111]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META111]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[TMP1]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META111]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META111]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META111]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_load_ps(float* A) {
-  // CHECK-LABEL: test_mm256_load_ps
-  // CHECK: load <8 x float>, ptr %{{.*}}, align 32
   return _mm256_load_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_load_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META114:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META114]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META114]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META114]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META114]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META114]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META114]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_load_si256(__m256i* A) {
-  // CHECK-LABEL: test_mm256_load_si256
-  // CHECK: load <4 x i64>, ptr %{{.*}}, align 32
   return _mm256_load_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META117:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META117]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META117]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META117]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[TMP1]], align 1
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META117]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META117]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META117]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_loadu_pd(double* A) {
-  // CHECK-LABEL: test_mm256_loadu_pd
-  // CHECK: load <4 x double>, ptr %{{.*}}, align 1{{$}}
   return _mm256_loadu_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META120]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META120]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META120]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[TMP1]], align 1
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META120]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META120]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META120]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_loadu_ps(float* A) {
-  // CHECK-LABEL: test_mm256_loadu_ps
-  // CHECK: load <8 x float>, ptr %{{.*}}, align 1{{$}}
   return _mm256_loadu_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META123:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META123]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META123]]
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META123]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 1
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META123]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META123]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META123]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_loadu_si256(__m256i* A) {
-  // CHECK-LABEL: test_mm256_loadu_si256
-  // CHECK: load <4 x i64>, ptr %{{.+}}, align 1{{$}}
   return _mm256_loadu_si256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu2_m128(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I2:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__P_ADDR_I3:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR_I1:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE2_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META126:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META126]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META126]]
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META126]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META126]]
+// X86-NEXT:    store ptr [[TMP2]], ptr [[__P_ADDR_I3]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I3]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[TMP3]], align 1
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL_I2]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I2]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE_I]], align 16, !noalias [[META126]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[COERCE_I]], align 16, !noalias [[META126]]
+// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META126]]
+// X86-NEXT:    store ptr [[TMP7]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[TMP8]], align 1
+// X86-NEXT:    store <4 x float> [[TMP9]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP10]], ptr [[COERCE2_I]], align 16, !noalias [[META126]]
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[COERCE2_I]], align 16, !noalias [[META126]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META129:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I1]], align 4, !noalias [[META129]]
+// X86-NEXT:    store <4 x float> [[TMP6]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META129]]
+// X86-NEXT:    store <4 x float> [[TMP11]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META129]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META129]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META129]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP_I]], align 32, !alias.scope [[META129]]
+// X86-NEXT:    [[TMP14:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !alias.scope [[META129]]
+// X86-NEXT:    store <8 x float> [[TMP14]], ptr [[TMP_I]], align 32, !alias.scope [[META129]]
+// X86-NEXT:    [[TMP15:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !noalias [[META126]]
+// X86-NEXT:    store <8 x float> [[TMP15]], ptr [[TMP]], align 32, !alias.scope [[META126]]
+// X86-NEXT:    [[TMP16:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META126]]
+// X86-NEXT:    store <8 x float> [[TMP16]], ptr [[TMP]], align 32, !alias.scope [[META126]]
+// X86-NEXT:    [[TMP17:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP17]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP18:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP18]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_loadu2_m128(float* A, float* B) {
-  // CHECK-LABEL: test_mm256_loadu2_m128
-  // CHECK: load <4 x float>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: load <4 x float>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_loadu2_m128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu2_m128d(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I2:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I3:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR_I1:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE2_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META132:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META132]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META132]]
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META132]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META132]]
+// X86-NEXT:    store ptr [[TMP2]], ptr [[__DP_ADDR_I3]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I3]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[TMP3]], align 1
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I2]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I2]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE_I]], align 16, !noalias [[META132]]
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE_I]], align 16, !noalias [[META132]]
+// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META132]]
+// X86-NEXT:    store ptr [[TMP7]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[TMP8]], align 1
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP10]], ptr [[COERCE2_I]], align 16, !noalias [[META132]]
+// X86-NEXT:    [[TMP11:%.*]] = load <2 x double>, ptr [[COERCE2_I]], align 16, !noalias [[META132]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META135:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I1]], align 4, !noalias [[META135]]
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META135]]
+// X86-NEXT:    store <2 x double> [[TMP11]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META135]]
+// X86-NEXT:    [[TMP12:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META135]]
+// X86-NEXT:    [[TMP13:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META135]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP_I]], align 32, !alias.scope [[META135]]
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !alias.scope [[META135]]
+// X86-NEXT:    store <4 x double> [[TMP14]], ptr [[TMP_I]], align 32, !alias.scope [[META135]]
+// X86-NEXT:    [[TMP15:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !noalias [[META132]]
+// X86-NEXT:    store <4 x double> [[TMP15]], ptr [[TMP]], align 32, !alias.scope [[META132]]
+// X86-NEXT:    [[TMP16:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META132]]
+// X86-NEXT:    store <4 x double> [[TMP16]], ptr [[TMP]], align 32, !alias.scope [[META132]]
+// X86-NEXT:    [[TMP17:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP17]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP18:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP18]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_loadu2_m128d(double* A, double* B) {
-  // CHECK-LABEL: test_mm256_loadu2_m128d
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_loadu2_m128d(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_loadu2_m128i(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I2:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR_I1:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META138]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META138]]
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META138]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4, !noalias [[META138]]
+// X86-NEXT:    store ptr [[TMP2]], ptr [[__P_ADDR_I2]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I2]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[TMP3]], align 1
+// X86-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4, !noalias [[META138]]
+// X86-NEXT:    store ptr [[TMP5]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[TMP6]], align 1
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META141:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I1]], align 4, !noalias [[META141]]
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META141]]
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META141]]
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META141]]
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META141]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x i64> [[SHUFFLE_I]], ptr [[TMP_I]], align 32, !alias.scope [[META141]]
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META141]]
+// X86-NEXT:    store <4 x i64> [[TMP10]], ptr [[TMP_I]], align 32, !alias.scope [[META141]]
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META138]]
+// X86-NEXT:    store <4 x i64> [[TMP11]], ptr [[TMP]], align 32, !alias.scope [[META138]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META138]]
+// X86-NEXT:    store <4 x i64> [[TMP12]], ptr [[TMP]], align 32, !alias.scope [[META138]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP13]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP14]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) {
-  // CHECK-LABEL: test_mm256_loadu2_m128i
-  // CHECK: load <2 x i64>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: load <2 x i64>, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_loadu2_m128i(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_maskload_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[TMP2]], <2 x i64> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_maskload_pd(double* A, __m128i B) {
-  // CHECK-LABEL: test_mm_maskload_pd
-  // CHECK: call <2 x double> @llvm.x86.avx.maskload.pd(ptr %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maskload_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_maskload_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META144:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META144]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META144]]
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 32, !noalias [[META144]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META144]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__M_ADDR_I]], align 32, !noalias [[META144]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP2]], <4 x i64> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META144]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META144]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META144]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_maskload_pd(double* A, __m256i B) {
-  // CHECK-LABEL: test_mm256_maskload_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_maskload_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_maskload_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <4 x i32>
+// X86-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP2]], <4 x i32> [[TMP4]])
+// X86-NEXT:    store <4 x float> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128 test_mm_maskload_ps(float* A, __m128i B) {
-  // CHECK-LABEL: test_mm_maskload_ps
-  // CHECK: call <4 x float> @llvm.x86.avx.maskload.ps(ptr %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maskload_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_maskload_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META147:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META147]]
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4, !noalias [[META147]]
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 32, !noalias [[META147]]
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4, !noalias [[META147]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__M_ADDR_I]], align 32, !noalias [[META147]]
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <8 x i32>
+// X86-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP2]], <8 x i32> [[TMP4]])
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META147]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META147]]
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META147]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_maskload_ps(float* A, __m256i B) {
-  // CHECK-LABEL: test_mm256_maskload_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_maskload_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_maskstore_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[C]], ptr [[C_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[C_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    call void @llvm.x86.avx.maskstore.pd(ptr [[TMP3]], <2 x i64> [[TMP4]], <2 x double> [[TMP5]])
+// X86-NEXT:    ret void
+//
 void test_mm_maskstore_pd(double* A, __m128i B, __m128d C) {
-  // CHECK-LABEL: test_mm_maskstore_pd
-  // CHECK: call void @llvm.x86.avx.maskstore.pd(ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}})
   _mm_maskstore_pd(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_maskstore_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]], <4 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[C]], ptr [[C_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[C_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[__M_ADDR_I]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP3]], <4 x i64> [[TMP4]], <4 x double> [[TMP5]])
+// X86-NEXT:    ret void
+//
 void test_mm256_maskstore_pd(double* A, __m256i B, __m256d C) {
-  // CHECK-LABEL: test_mm256_maskstore_pd
-  // CHECK: call void @llvm.x86.avx.maskstore.pd.256(ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}})
   _mm256_maskstore_pd(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm_maskstore_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__M_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    call void @llvm.x86.avx.maskstore.ps(ptr [[TMP3]], <4 x i32> [[TMP5]], <4 x float> [[TMP6]])
+// X86-NEXT:    ret void
+//
 void test_mm_maskstore_ps(float* A, __m128i B, __m128 C) {
-  // CHECK-LABEL: test_mm_maskstore_ps
-  // CHECK: call void @llvm.x86.avx.maskstore.ps(ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}})
   _mm_maskstore_ps(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_maskstore_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__M_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[C]], ptr [[C_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__M_ADDR_I]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[__M_ADDR_I]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i64> [[TMP4]] to <8 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP3]], <8 x i32> [[TMP5]], <8 x float> [[TMP6]])
+// X86-NEXT:    ret void
+//
 void test_mm256_maskstore_ps(float* A, __m256i B, __m256 C) {
-  // CHECK-LABEL: test_mm256_maskstore_ps
-  // CHECK: call void @llvm.x86.avx.maskstore.ps.256(ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}})
   _mm256_maskstore_ps(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_max_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META150:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META150]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META150]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META150]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META150]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META150]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META150]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META150]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META150]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_max_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_max_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_max_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_max_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META153:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META153]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META153]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META153]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META153]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META153]]
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META153]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META153]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META153]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_max_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_max_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_max_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_min_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META156:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META156]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META156]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META156]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META156]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META156]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META156]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META156]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META156]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_min_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_min_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_min_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_min_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META159:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META159]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META159]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META159]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META159]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META159]]
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META159]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META159]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META159]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_min_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_min_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_min_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_movedup_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META162:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META162]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META162]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META162]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META162]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META162]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META162]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META162]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_movedup_pd(__m256d A) {
-  // CHECK-LABEL: test_mm256_movedup_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   return _mm256_movedup_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_movehdup_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META165:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META165]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META165]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META165]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META165]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META165]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META165]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META165]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_movehdup_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_movehdup_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   return _mm256_movehdup_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_moveldup_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META168:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META168]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META168]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META168]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META168]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META168]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META168]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META168]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_moveldup_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_moveldup_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   return _mm256_moveldup_ps(A);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_movemask_pd(
+// X86-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> [[TMP1]])
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm256_movemask_pd(__m256d A) {
-  // CHECK-LABEL: test_mm256_movemask_pd
-  // CHECK: call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %{{.*}})
   return _mm256_movemask_pd(A);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_movemask_ps(
+// X86-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> [[TMP1]])
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm256_movemask_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_movemask_ps
-  // CHECK: call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %{{.*}})
   return _mm256_movemask_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_mul_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META171:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META171]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META171]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META171]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META171]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META171]]
+// X86-NEXT:    [[MUL_I:%.*]] = fmul <4 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <4 x double> [[MUL_I]], ptr [[TMP]], align 32, !alias.scope [[META171]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META171]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META171]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_mul_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_mul_pd
-  // CHECK: fmul <4 x double>
   return _mm256_mul_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_mul_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META174:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META174]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META174]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META174]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META174]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META174]]
+// X86-NEXT:    [[MUL_I:%.*]] = fmul <8 x float> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <8 x float> [[MUL_I]], ptr [[TMP]], align 32, !alias.scope [[META174]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META174]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META174]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_mul_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_mul_ps
-  // CHECK: fmul <8 x float>
   return _mm256_mul_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_or_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META177:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META177]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META177]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META177]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META177]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META177]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+// X86-NEXT:    [[OR_I:%.*]] = or <4 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[OR_I]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META177]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META177]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META177]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_or_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_or_pd
-  // CHECK: or <4 x i64>
   return _mm256_or_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_or_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META180:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META180]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META180]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META180]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META180]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META180]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+// X86-NEXT:    [[OR_I:%.*]] = or <8 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[OR_I]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META180]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META180]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META180]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_or_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_or_ps
-  // CHECK: or <8 x i32>
   return _mm256_or_ps(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_permute_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+// X86-NEXT:    store <2 x double> [[PERMIL]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128d test_mm_permute_pd(__m128d A) {
-  // CHECK-LABEL: test_mm_permute_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <2 x i32> <i32 1, i32 0>
   return _mm_permute_pd(A, 1);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permute_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// X86-NEXT:    store <4 x double> [[PERMIL]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_permute_pd(__m256d A) {
-  // CHECK-LABEL: test_mm256_permute_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
   return _mm256_permute_pd(A, 5);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_permute_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// X86-NEXT:    store <4 x float> [[PERMIL]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128 test_mm_permute_ps(__m128 A) {
-  // CHECK-LABEL: test_mm_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   return _mm_permute_ps(A, 0x1b);
 }
 
 // Test case for PR12401
+//
+// X86-LABEL: define <2 x i64> @test2_mm_permute_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x float> [[PERMIL]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128 test2_mm_permute_ps(__m128 a) {
-  // CHECK-LABEL: test2_mm_permute_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
   return _mm_permute_ps(a, 0xe6);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permute_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// X86-NEXT:    store <8 x float> [[PERMIL]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_permute_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_permute_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
   return _mm256_permute_ps(A, 0x1b);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permute2f128_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[VPERM:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+// X86-NEXT:    store <4 x double> [[VPERM]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_permute2f128_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   return _mm256_permute2f128_pd(A, B, 0x31);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permute2f128_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[VPERM:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> [[TMP0]], <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+// X86-NEXT:    store <8 x float> [[VPERM]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_permute2f128_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_permute2f128_ps(A, B, 0x13);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permute2f128_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x i64> [[TMP0]] to <8 x i32>
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[VPERM:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[VPERM]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
-  // CHECK-LABEL: test_mm256_permute2f128_si256
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm256_permute2f128_si256(A, B, 0x20);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_permutevar_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__C_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__C_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> [[TMP2]], <2 x i64> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_permutevar_pd(__m128d A, __m128i B) {
-  // CHECK-LABEL: test_mm_permutevar_pd
-  // CHECK: call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_permutevar_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permutevar_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META183:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META183]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META183]]
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__C_ADDR_I]], align 32, !noalias [[META183]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META183]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__C_ADDR_I]], align 32, !noalias [[META183]]
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> [[TMP2]], <4 x i64> [[TMP3]])
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META183]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META183]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META183]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_permutevar_pd(__m256d A, __m256i B) {
-  // CHECK-LABEL: test_mm256_permutevar_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_permutevar_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_permutevar_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__C_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__C_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <4 x i32>
+// X86-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> [[TMP2]], <4 x i32> [[TMP4]])
+// X86-NEXT:    store <4 x float> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128 test_mm_permutevar_ps(__m128 A, __m128i B) {
-  // CHECK-LABEL: test_mm_permutevar_ps
-  // CHECK: call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_permutevar_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_permutevar_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META186:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META186]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META186]]
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__C_ADDR_I]], align 32, !noalias [[META186]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META186]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__C_ADDR_I]], align 32, !noalias [[META186]]
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <4 x i64> [[TMP3]] to <8 x i32>
+// X86-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> [[TMP2]], <8 x i32> [[TMP4]])
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META186]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META186]]
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META186]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_permutevar_ps(__m256 A, __m256i B) {
-  // CHECK-LABEL: test_mm256_permutevar_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_permutevar_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_rcp_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META189:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META189]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META189]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META189]]
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> [[TMP1]])
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META189]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META189]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META189]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_rcp_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_rcp_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %{{.*}})
   return _mm256_rcp_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_round_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> [[TMP0]], i32 4)
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_round_pd(__m256d x) {
-  // CHECK-LABEL: test_mm256_round_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
   return _mm256_round_pd(x, 4);
 }
 
+//
+// X86-LABEL: define void @test_mm256_round_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[X]], ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[X_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> [[TMP0]], i32 4)
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_round_ps(__m256 x) {
-  // CHECK-LABEL: test_mm256_round_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
   return _mm256_round_ps(x, 4);
 }
 
+//
+// X86-LABEL: define void @test_mm256_rsqrt_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META192:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META192]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META192]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META192]]
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> [[TMP1]])
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META192]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META192]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META192]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_rsqrt_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_rsqrt_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %{{.*}})
   return _mm256_rsqrt_ps(A);
 }
 
+//
 __m256i test_mm256_set_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
                             char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
                             char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
                             char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) {
-  // CHECK-LABEL: test_mm256_set_epi8
-  // CHECK: insertelement <32 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
   return _mm256_set_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_epi16(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i16 noundef signext [[A0:%.*]], i16 noundef signext [[A1:%.*]], i16 noundef signext [[A2:%.*]], i16 noundef signext [[A3:%.*]], i16 noundef signext [[A4:%.*]], i16 noundef signext [[A5:%.*]], i16 noundef signext [[A6:%.*]], i16 noundef signext [[A7:%.*]], i16 noundef signext [[A8:%.*]], i16 noundef signext [[A9:%.*]], i16 noundef signext [[A10:%.*]], i16 noundef signext [[A11:%.*]], i16 noundef signext [[A12:%.*]], i16 noundef signext [[A13:%.*]], i16 noundef signext [[A14:%.*]], i16 noundef signext [[A15:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W15_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W14_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W13_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W12_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W11_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W10_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W09_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W08_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W07_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W06_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W05_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W04_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W03_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W02_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W01_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W00_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x i16>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A8_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A9_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A10_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A11_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A12_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A13_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A14_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A15_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i16 [[A0]], ptr [[A0_ADDR]], align 2
+// X86-NEXT:    store i16 [[A1]], ptr [[A1_ADDR]], align 2
+// X86-NEXT:    store i16 [[A2]], ptr [[A2_ADDR]], align 2
+// X86-NEXT:    store i16 [[A3]], ptr [[A3_ADDR]], align 2
+// X86-NEXT:    store i16 [[A4]], ptr [[A4_ADDR]], align 2
+// X86-NEXT:    store i16 [[A5]], ptr [[A5_ADDR]], align 2
+// X86-NEXT:    store i16 [[A6]], ptr [[A6_ADDR]], align 2
+// X86-NEXT:    store i16 [[A7]], ptr [[A7_ADDR]], align 2
+// X86-NEXT:    store i16 [[A8]], ptr [[A8_ADDR]], align 2
+// X86-NEXT:    store i16 [[A9]], ptr [[A9_ADDR]], align 2
+// X86-NEXT:    store i16 [[A10]], ptr [[A10_ADDR]], align 2
+// X86-NEXT:    store i16 [[A11]], ptr [[A11_ADDR]], align 2
+// X86-NEXT:    store i16 [[A12]], ptr [[A12_ADDR]], align 2
+// X86-NEXT:    store i16 [[A13]], ptr [[A13_ADDR]], align 2
+// X86-NEXT:    store i16 [[A14]], ptr [[A14_ADDR]], align 2
+// X86-NEXT:    store i16 [[A15]], ptr [[A15_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A0_ADDR]], align 2
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A1_ADDR]], align 2
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[A2_ADDR]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A3_ADDR]], align 2
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[A4_ADDR]], align 2
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[A5_ADDR]], align 2
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[A6_ADDR]], align 2
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[A7_ADDR]], align 2
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[A8_ADDR]], align 2
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[A9_ADDR]], align 2
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[A10_ADDR]], align 2
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[A11_ADDR]], align 2
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[A12_ADDR]], align 2
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[A13_ADDR]], align 2
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[A14_ADDR]], align 2
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[A15_ADDR]], align 2
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META198:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W15_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W14_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W13_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W12_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W11_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W10_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W09_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W08_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP8]], ptr [[__W07_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP9]], ptr [[__W06_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP10]], ptr [[__W05_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP11]], ptr [[__W04_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP12]], ptr [[__W03_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP13]], ptr [[__W02_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP14]], ptr [[__W01_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    store i16 [[TMP15]], ptr [[__W00_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[__W00_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x i16> poison, i16 [[TMP16]], i32 0
+// X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[__W01_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x i16> [[VECINIT_I]], i16 [[TMP17]], i32 1
+// X86-NEXT:    [[TMP18:%.*]] = load i16, ptr [[__W02_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x i16> [[VECINIT1_I]], i16 [[TMP18]], i32 2
+// X86-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__W03_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x i16> [[VECINIT2_I]], i16 [[TMP19]], i32 3
+// X86-NEXT:    [[TMP20:%.*]] = load i16, ptr [[__W04_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x i16> [[VECINIT3_I]], i16 [[TMP20]], i32 4
+// X86-NEXT:    [[TMP21:%.*]] = load i16, ptr [[__W05_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x i16> [[VECINIT4_I]], i16 [[TMP21]], i32 5
+// X86-NEXT:    [[TMP22:%.*]] = load i16, ptr [[__W06_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x i16> [[VECINIT5_I]], i16 [[TMP22]], i32 6
+// X86-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__W07_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x i16> [[VECINIT6_I]], i16 [[TMP23]], i32 7
+// X86-NEXT:    [[TMP24:%.*]] = load i16, ptr [[__W08_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x i16> [[VECINIT7_I]], i16 [[TMP24]], i32 8
+// X86-NEXT:    [[TMP25:%.*]] = load i16, ptr [[__W09_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x i16> [[VECINIT8_I]], i16 [[TMP25]], i32 9
+// X86-NEXT:    [[TMP26:%.*]] = load i16, ptr [[__W10_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x i16> [[VECINIT9_I]], i16 [[TMP26]], i32 10
+// X86-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__W11_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x i16> [[VECINIT10_I]], i16 [[TMP27]], i32 11
+// X86-NEXT:    [[TMP28:%.*]] = load i16, ptr [[__W12_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x i16> [[VECINIT11_I]], i16 [[TMP28]], i32 12
+// X86-NEXT:    [[TMP29:%.*]] = load i16, ptr [[__W13_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x i16> [[VECINIT12_I]], i16 [[TMP29]], i32 13
+// X86-NEXT:    [[TMP30:%.*]] = load i16, ptr [[__W14_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x i16> [[VECINIT13_I]], i16 [[TMP30]], i32 14
+// X86-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__W15_ADDR_I]], align 2, !noalias [[META198]]
+// X86-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x i16> [[VECINIT14_I]], i16 [[TMP31]], i32 15
+// X86-NEXT:    store <16 x i16> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META198]]
+// X86-NEXT:    [[TMP32:%.*]] = load <16 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META198]]
+// X86-NEXT:    [[TMP33:%.*]] = bitcast <16 x i16> [[TMP32]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP33]], ptr [[TMP]], align 32, !alias.scope [[META198]]
+// X86-NEXT:    [[TMP34:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META198]]
+// X86-NEXT:    store <4 x i64> [[TMP34]], ptr [[TMP]], align 32, !alias.scope [[META198]]
+// X86-NEXT:    [[TMP35:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP35]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP36:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP36]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
                              short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) {
-  // CHECK-LABEL: test_mm256_set_epi16
-  // CHECK: insertelement <16 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
   return _mm256_set_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i32 noundef [[A0:%.*]], i32 noundef [[A1:%.*]], i32 noundef [[A2:%.*]], i32 noundef [[A3:%.*]], i32 noundef [[A4:%.*]], i32 noundef [[A5:%.*]], i32 noundef [[A6:%.*]], i32 noundef [[A7:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__I0_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I3_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I4_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I5_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I6_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I7_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i32>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i32 [[A0]], ptr [[A0_ADDR]], align 4
+// X86-NEXT:    store i32 [[A1]], ptr [[A1_ADDR]], align 4
+// X86-NEXT:    store i32 [[A2]], ptr [[A2_ADDR]], align 4
+// X86-NEXT:    store i32 [[A3]], ptr [[A3_ADDR]], align 4
+// X86-NEXT:    store i32 [[A4]], ptr [[A4_ADDR]], align 4
+// X86-NEXT:    store i32 [[A5]], ptr [[A5_ADDR]], align 4
+// X86-NEXT:    store i32 [[A6]], ptr [[A6_ADDR]], align 4
+// X86-NEXT:    store i32 [[A7]], ptr [[A7_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A0_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A1_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A2_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A3_ADDR]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A4_ADDR]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A5_ADDR]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A6_ADDR]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[A7_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META201:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I0_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I1_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I2_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I3_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP4]], ptr [[__I4_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP5]], ptr [[__I5_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP6]], ptr [[__I6_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    store i32 [[TMP7]], ptr [[__I7_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__I7_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load i32, ptr [[__I6_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i32> [[VECINIT_I]], i32 [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__I5_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i32> [[VECINIT1_I]], i32 [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I4_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i32> [[VECINIT2_I]], i32 [[TMP11]], i32 3
+// X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__I3_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i32> [[VECINIT3_I]], i32 [[TMP12]], i32 4
+// X86-NEXT:    [[TMP13:%.*]] = load i32, ptr [[__I2_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i32> [[VECINIT4_I]], i32 [[TMP13]], i32 5
+// X86-NEXT:    [[TMP14:%.*]] = load i32, ptr [[__I1_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i32> [[VECINIT5_I]], i32 [[TMP14]], i32 6
+// X86-NEXT:    [[TMP15:%.*]] = load i32, ptr [[__I0_ADDR_I]], align 4, !noalias [[META201]]
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i32> [[VECINIT6_I]], i32 [[TMP15]], i32 7
+// X86-NEXT:    store <8 x i32> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META201]]
+// X86-NEXT:    [[TMP16:%.*]] = load <8 x i32>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META201]]
+// X86-NEXT:    [[TMP17:%.*]] = bitcast <8 x i32> [[TMP16]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP17]], ptr [[TMP]], align 32, !alias.scope [[META201]]
+// X86-NEXT:    [[TMP18:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META201]]
+// X86-NEXT:    store <4 x i64> [[TMP18]], ptr [[TMP]], align 32, !alias.scope [[META201]]
+// X86-NEXT:    [[TMP19:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP19]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP20:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP20]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) {
-  // CHECK-LABEL: test_mm256_set_epi32
-  // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
   return _mm256_set_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_epi64x(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i64 noundef [[A0:%.*]], i64 noundef [[A1:%.*]], i64 noundef [[A2:%.*]], i64 noundef [[A3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i64 [[A0]], ptr [[A0_ADDR]], align 8
+// X86-NEXT:    store i64 [[A1]], ptr [[A1_ADDR]], align 8
+// X86-NEXT:    store i64 [[A2]], ptr [[A2_ADDR]], align 8
+// X86-NEXT:    store i64 [[A3]], ptr [[A3_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A0_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A1_ADDR]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A2_ADDR]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[A3_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META204:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META204]]
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__A_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__B_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__C_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    store i64 [[TMP3]], ptr [[__D_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__D_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i32 0
+// X86-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i64> [[VECINIT_I]], i64 [[TMP5]], i32 1
+// X86-NEXT:    [[TMP6:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i64> [[VECINIT1_I]], i64 [[TMP6]], i32 2
+// X86-NEXT:    [[TMP7:%.*]] = load i64, ptr [[__A_ADDR_I]], align 8, !noalias [[META204]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i64> [[VECINIT2_I]], i64 [[TMP7]], i32 3
+// X86-NEXT:    store <4 x i64> [[VECINIT3_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META204]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META204]]
+// X86-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META204]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META204]]
+// X86-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP]], align 32, !alias.scope [[META204]]
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP11]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set_epi64x(long long A0, long long A1, long long A2, long long A3) {
-  // CHECK-LABEL: test_mm256_set_epi64x
-  // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
   return _mm256_set_epi64x(A0, A1, A2, A3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_m128(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META207:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META207]]
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META207]]
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META207]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META207]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META207]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META207]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META207]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META207]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_set_m128(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm256_set_m128
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_set_m128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_m128d(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META210:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META210]]
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META210]]
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META210]]
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META210]]
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META210]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META210]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META210]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META210]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_set_m128d(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm256_set_m128d
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_set_m128d(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_m128i(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META213:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META213]]
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META213]]
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META213]]
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META213]]
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META213]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x i64> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META213]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META213]]
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META213]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set_m128i(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm256_set_m128i
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_set_m128i(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], double noundef [[A0:%.*]], double noundef [[A1:%.*]], double noundef [[A2:%.*]], double noundef [[A3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store double [[A0]], ptr [[A0_ADDR]], align 8
+// X86-NEXT:    store double [[A1]], ptr [[A1_ADDR]], align 8
+// X86-NEXT:    store double [[A2]], ptr [[A2_ADDR]], align 8
+// X86-NEXT:    store double [[A3]], ptr [[A3_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A0_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[A1_ADDR]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[A2_ADDR]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[A3_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META216:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META216]]
+// X86-NEXT:    store double [[TMP0]], ptr [[__A_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    store double [[TMP1]], ptr [[__B_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    store double [[TMP2]], ptr [[__C_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    store double [[TMP3]], ptr [[__D_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__D_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x double> poison, double [[TMP4]], i32 0
+// X86-NEXT:    [[TMP5:%.*]] = load double, ptr [[__C_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x double> [[VECINIT_I]], double [[TMP5]], i32 1
+// X86-NEXT:    [[TMP6:%.*]] = load double, ptr [[__B_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x double> [[VECINIT1_I]], double [[TMP6]], i32 2
+// X86-NEXT:    [[TMP7:%.*]] = load double, ptr [[__A_ADDR_I]], align 8, !noalias [[META216]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x double> [[VECINIT2_I]], double [[TMP7]], i32 3
+// X86-NEXT:    store <4 x double> [[VECINIT3_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META216]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META216]]
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META216]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META216]]
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[TMP]], align 32, !alias.scope [[META216]]
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP11]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_set_pd(double A0, double A1, double A2, double A3) {
-  // CHECK-LABEL: test_mm256_set_pd
-  // CHECK: insertelement <4 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_set_pd(A0, A1, A2, A3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], float noundef [[A0:%.*]], float noundef [[A1:%.*]], float noundef [[A2:%.*]], float noundef [[A3:%.*]], float noundef [[A4:%.*]], float noundef [[A5:%.*]], float noundef [[A6:%.*]], float noundef [[A7:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__E_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__F_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__G_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__H_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store float [[A0]], ptr [[A0_ADDR]], align 4
+// X86-NEXT:    store float [[A1]], ptr [[A1_ADDR]], align 4
+// X86-NEXT:    store float [[A2]], ptr [[A2_ADDR]], align 4
+// X86-NEXT:    store float [[A3]], ptr [[A3_ADDR]], align 4
+// X86-NEXT:    store float [[A4]], ptr [[A4_ADDR]], align 4
+// X86-NEXT:    store float [[A5]], ptr [[A5_ADDR]], align 4
+// X86-NEXT:    store float [[A6]], ptr [[A6_ADDR]], align 4
+// X86-NEXT:    store float [[A7]], ptr [[A7_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load float, ptr [[A0_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load float, ptr [[A1_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load float, ptr [[A2_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load float, ptr [[A3_ADDR]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load float, ptr [[A4_ADDR]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load float, ptr [[A5_ADDR]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load float, ptr [[A6_ADDR]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load float, ptr [[A7_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META219:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP1]], ptr [[__B_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP2]], ptr [[__C_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP3]], ptr [[__D_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP4]], ptr [[__E_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP5]], ptr [[__F_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP6]], ptr [[__G_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    store float [[TMP7]], ptr [[__H_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[TMP8:%.*]] = load float, ptr [[__H_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x float> poison, float [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load float, ptr [[__G_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x float> [[VECINIT_I]], float [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load float, ptr [[__F_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x float> [[VECINIT1_I]], float [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load float, ptr [[__E_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x float> [[VECINIT2_I]], float [[TMP11]], i32 3
+// X86-NEXT:    [[TMP12:%.*]] = load float, ptr [[__D_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x float> [[VECINIT3_I]], float [[TMP12]], i32 4
+// X86-NEXT:    [[TMP13:%.*]] = load float, ptr [[__C_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x float> [[VECINIT4_I]], float [[TMP13]], i32 5
+// X86-NEXT:    [[TMP14:%.*]] = load float, ptr [[__B_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x float> [[VECINIT5_I]], float [[TMP14]], i32 6
+// X86-NEXT:    [[TMP15:%.*]] = load float, ptr [[__A_ADDR_I]], align 4, !noalias [[META219]]
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x float> [[VECINIT6_I]], float [[TMP15]], i32 7
+// X86-NEXT:    store <8 x float> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META219]]
+// X86-NEXT:    [[TMP16:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META219]]
+// X86-NEXT:    store <8 x float> [[TMP16]], ptr [[TMP]], align 32, !alias.scope [[META219]]
+// X86-NEXT:    [[TMP17:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META219]]
+// X86-NEXT:    store <8 x float> [[TMP17]], ptr [[TMP]], align 32, !alias.scope [[META219]]
+// X86-NEXT:    [[TMP18:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP18]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP19:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP19]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_set_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
-  // CHECK-LABEL: test_mm256_set_ps
-  // CHECK: insertelement <8 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_set_ps(A0, A1, A2, A3, A4, A5, A6, A7);
 }
 
+//
 __m256i test_mm256_set1_epi8(char A) {
-  // CHECK-LABEL: test_mm256_set1_epi8
-  // CHECK: insertelement <32 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
   return _mm256_set1_epi8(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set1_epi16(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W15_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W14_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W13_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W12_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W11_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W10_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W09_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W08_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W07_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W06_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W05_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W04_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W03_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W02_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W01_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W00_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i16>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META229:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META229]]
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2, !noalias [[META229]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META232:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META235:![0-9]+]]
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W15_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W14_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W13_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W12_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W11_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W10_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W09_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP8]], ptr [[__W08_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP9]], ptr [[__W07_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP10]], ptr [[__W06_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP11]], ptr [[__W05_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP12]], ptr [[__W04_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP13]], ptr [[__W03_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP14]], ptr [[__W02_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP15]], ptr [[__W01_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    store i16 [[TMP16]], ptr [[__W00_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[__W00_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x i16> poison, i16 [[TMP17]], i32 0
+// X86-NEXT:    [[TMP18:%.*]] = load i16, ptr [[__W01_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x i16> [[VECINIT_I_I]], i16 [[TMP18]], i32 1
+// X86-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__W02_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x i16> [[VECINIT1_I_I]], i16 [[TMP19]], i32 2
+// X86-NEXT:    [[TMP20:%.*]] = load i16, ptr [[__W03_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x i16> [[VECINIT2_I_I]], i16 [[TMP20]], i32 3
+// X86-NEXT:    [[TMP21:%.*]] = load i16, ptr [[__W04_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x i16> [[VECINIT3_I_I]], i16 [[TMP21]], i32 4
+// X86-NEXT:    [[TMP22:%.*]] = load i16, ptr [[__W05_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x i16> [[VECINIT4_I_I]], i16 [[TMP22]], i32 5
+// X86-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__W06_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x i16> [[VECINIT5_I_I]], i16 [[TMP23]], i32 6
+// X86-NEXT:    [[TMP24:%.*]] = load i16, ptr [[__W07_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x i16> [[VECINIT6_I_I]], i16 [[TMP24]], i32 7
+// X86-NEXT:    [[TMP25:%.*]] = load i16, ptr [[__W08_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x i16> [[VECINIT7_I_I]], i16 [[TMP25]], i32 8
+// X86-NEXT:    [[TMP26:%.*]] = load i16, ptr [[__W09_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x i16> [[VECINIT8_I_I]], i16 [[TMP26]], i32 9
+// X86-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__W10_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x i16> [[VECINIT9_I_I]], i16 [[TMP27]], i32 10
+// X86-NEXT:    [[TMP28:%.*]] = load i16, ptr [[__W11_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x i16> [[VECINIT10_I_I]], i16 [[TMP28]], i32 11
+// X86-NEXT:    [[TMP29:%.*]] = load i16, ptr [[__W12_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x i16> [[VECINIT11_I_I]], i16 [[TMP29]], i32 12
+// X86-NEXT:    [[TMP30:%.*]] = load i16, ptr [[__W13_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x i16> [[VECINIT12_I_I]], i16 [[TMP30]], i32 13
+// X86-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__W14_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x i16> [[VECINIT13_I_I]], i16 [[TMP31]], i32 14
+// X86-NEXT:    [[TMP32:%.*]] = load i16, ptr [[__W15_ADDR_I_I]], align 2, !noalias [[META235]]
+// X86-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x i16> [[VECINIT14_I_I]], i16 [[TMP32]], i32 15
+// X86-NEXT:    store <16 x i16> [[VECINIT15_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META235]]
+// X86-NEXT:    [[TMP33:%.*]] = load <16 x i16>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META235]]
+// X86-NEXT:    [[TMP34:%.*]] = bitcast <16 x i16> [[TMP33]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP34]], ptr [[TMP_I]], align 32, !alias.scope [[META232]], !noalias [[META229]]
+// X86-NEXT:    [[TMP35:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META232]], !noalias [[META229]]
+// X86-NEXT:    store <4 x i64> [[TMP35]], ptr [[TMP_I]], align 32, !alias.scope [[META232]], !noalias [[META229]]
+// X86-NEXT:    [[TMP36:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META229]]
+// X86-NEXT:    store <4 x i64> [[TMP36]], ptr [[TMP]], align 32, !alias.scope [[META229]]
+// X86-NEXT:    [[TMP37:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META229]]
+// X86-NEXT:    store <4 x i64> [[TMP37]], ptr [[TMP]], align 32, !alias.scope [[META229]]
+// X86-NEXT:    [[TMP38:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP38]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP39:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP39]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set1_epi16(short A) {
-  // CHECK-LABEL: test_mm256_set1_epi16
-  // CHECK: insertelement <16 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
   return _mm256_set1_epi16(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set1_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__I0_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I3_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I4_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I5_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I6_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I7_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i32>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__I_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META236:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4, !noalias [[META236]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META239:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META242:![0-9]+]]
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I0_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I1_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I2_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP4]], ptr [[__I3_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP5]], ptr [[__I4_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP6]], ptr [[__I5_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP7]], ptr [[__I6_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    store i32 [[TMP8]], ptr [[__I7_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[TMP9:%.*]] = load i32, ptr [[__I7_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP9]], i32 0
+// X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__I6_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i32> [[VECINIT_I_I]], i32 [[TMP10]], i32 1
+// X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I5_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i32> [[VECINIT1_I_I]], i32 [[TMP11]], i32 2
+// X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__I4_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i32> [[VECINIT2_I_I]], i32 [[TMP12]], i32 3
+// X86-NEXT:    [[TMP13:%.*]] = load i32, ptr [[__I3_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i32> [[VECINIT3_I_I]], i32 [[TMP13]], i32 4
+// X86-NEXT:    [[TMP14:%.*]] = load i32, ptr [[__I2_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i32> [[VECINIT4_I_I]], i32 [[TMP14]], i32 5
+// X86-NEXT:    [[TMP15:%.*]] = load i32, ptr [[__I1_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i32> [[VECINIT5_I_I]], i32 [[TMP15]], i32 6
+// X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[__I0_ADDR_I_I]], align 4, !noalias [[META242]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i32> [[VECINIT6_I_I]], i32 [[TMP16]], i32 7
+// X86-NEXT:    store <8 x i32> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META242]]
+// X86-NEXT:    [[TMP17:%.*]] = load <8 x i32>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META242]]
+// X86-NEXT:    [[TMP18:%.*]] = bitcast <8 x i32> [[TMP17]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP18]], ptr [[TMP_I]], align 32, !alias.scope [[META239]], !noalias [[META236]]
+// X86-NEXT:    [[TMP19:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META239]], !noalias [[META236]]
+// X86-NEXT:    store <4 x i64> [[TMP19]], ptr [[TMP_I]], align 32, !alias.scope [[META239]], !noalias [[META236]]
+// X86-NEXT:    [[TMP20:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META236]]
+// X86-NEXT:    store <4 x i64> [[TMP20]], ptr [[TMP]], align 32, !alias.scope [[META236]]
+// X86-NEXT:    [[TMP21:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META236]]
+// X86-NEXT:    store <4 x i64> [[TMP21]], ptr [[TMP]], align 32, !alias.scope [[META236]]
+// X86-NEXT:    [[TMP22:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP22]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP23:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP23]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set1_epi32(int A) {
-  // CHECK-LABEL: test_mm256_set1_epi32
-  // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
   return _mm256_set1_epi32(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set1_epi64x(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__Q_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META243:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META243]]
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__Q_ADDR_I]], align 8, !noalias [[META243]]
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8, !noalias [[META243]]
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8, !noalias [[META243]]
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8, !noalias [[META243]]
+// X86-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8, !noalias [[META243]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META246:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META249:![0-9]+]]
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__A_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__B_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    store i64 [[TMP3]], ptr [[__C_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    store i64 [[TMP4]], ptr [[__D_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__D_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i64> poison, i64 [[TMP5]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load i64, ptr [[__C_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i64> [[VECINIT_I_I]], i64 [[TMP6]], i32 1
+// X86-NEXT:    [[TMP7:%.*]] = load i64, ptr [[__B_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i64> [[VECINIT1_I_I]], i64 [[TMP7]], i32 2
+// X86-NEXT:    [[TMP8:%.*]] = load i64, ptr [[__A_ADDR_I_I]], align 8, !noalias [[META249]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i64> [[VECINIT2_I_I]], i64 [[TMP8]], i32 3
+// X86-NEXT:    store <4 x i64> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META249]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META249]]
+// X86-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP_I]], align 32, !alias.scope [[META246]], !noalias [[META243]]
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META246]], !noalias [[META243]]
+// X86-NEXT:    store <4 x i64> [[TMP10]], ptr [[TMP_I]], align 32, !alias.scope [[META246]], !noalias [[META243]]
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META243]]
+// X86-NEXT:    store <4 x i64> [[TMP11]], ptr [[TMP]], align 32, !alias.scope [[META243]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META243]]
+// X86-NEXT:    store <4 x i64> [[TMP12]], ptr [[TMP]], align 32, !alias.scope [[META243]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP13]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP14]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_set1_epi64x(long long A) {
-  // CHECK-LABEL: test_mm256_set1_epi64x
-  // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
   return _mm256_set1_epi64x(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set1_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], double noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META250:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META250]]
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8, !noalias [[META250]]
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[__W_ADDR_I]], align 8, !noalias [[META250]]
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[__W_ADDR_I]], align 8, !noalias [[META250]]
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__W_ADDR_I]], align 8, !noalias [[META250]]
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__W_ADDR_I]], align 8, !noalias [[META250]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META253:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META256:![0-9]+]]
+// X86-NEXT:    store double [[TMP1]], ptr [[__A_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    store double [[TMP2]], ptr [[__B_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    store double [[TMP3]], ptr [[__C_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    store double [[TMP4]], ptr [[__D_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    [[TMP5:%.*]] = load double, ptr [[__D_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x double> poison, double [[TMP5]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load double, ptr [[__C_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x double> [[VECINIT_I_I]], double [[TMP6]], i32 1
+// X86-NEXT:    [[TMP7:%.*]] = load double, ptr [[__B_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x double> [[VECINIT1_I_I]], double [[TMP7]], i32 2
+// X86-NEXT:    [[TMP8:%.*]] = load double, ptr [[__A_ADDR_I_I]], align 8, !noalias [[META256]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x double> [[VECINIT2_I_I]], double [[TMP8]], i32 3
+// X86-NEXT:    store <4 x double> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META256]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META256]]
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[TMP_I]], align 32, !alias.scope [[META253]], !noalias [[META250]]
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !alias.scope [[META253]], !noalias [[META250]]
+// X86-NEXT:    store <4 x double> [[TMP10]], ptr [[TMP_I]], align 32, !alias.scope [[META253]], !noalias [[META250]]
+// X86-NEXT:    [[TMP11:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !noalias [[META250]]
+// X86-NEXT:    store <4 x double> [[TMP11]], ptr [[TMP]], align 32, !alias.scope [[META250]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META250]]
+// X86-NEXT:    store <4 x double> [[TMP12]], ptr [[TMP]], align 32, !alias.scope [[META250]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP13]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP14]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_set1_pd(double A) {
-  // CHECK-LABEL: test_mm256_set1_pd
-  // CHECK: insertelement <4 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_set1_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_set1_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], float noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__E_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__F_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__G_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__H_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store float [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META257:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    store float [[TMP0]], ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP1:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP2:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP3:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP4:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP5:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP6:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP7:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    [[TMP8:%.*]] = load float, ptr [[__W_ADDR_I]], align 4, !noalias [[META257]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META260:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META263:![0-9]+]]
+// X86-NEXT:    store float [[TMP1]], ptr [[__A_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP2]], ptr [[__B_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP3]], ptr [[__C_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP4]], ptr [[__D_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP5]], ptr [[__E_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP6]], ptr [[__F_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP7]], ptr [[__G_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    store float [[TMP8]], ptr [[__H_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[TMP9:%.*]] = load float, ptr [[__H_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x float> poison, float [[TMP9]], i32 0
+// X86-NEXT:    [[TMP10:%.*]] = load float, ptr [[__G_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x float> [[VECINIT_I_I]], float [[TMP10]], i32 1
+// X86-NEXT:    [[TMP11:%.*]] = load float, ptr [[__F_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x float> [[VECINIT1_I_I]], float [[TMP11]], i32 2
+// X86-NEXT:    [[TMP12:%.*]] = load float, ptr [[__E_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x float> [[VECINIT2_I_I]], float [[TMP12]], i32 3
+// X86-NEXT:    [[TMP13:%.*]] = load float, ptr [[__D_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x float> [[VECINIT3_I_I]], float [[TMP13]], i32 4
+// X86-NEXT:    [[TMP14:%.*]] = load float, ptr [[__C_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x float> [[VECINIT4_I_I]], float [[TMP14]], i32 5
+// X86-NEXT:    [[TMP15:%.*]] = load float, ptr [[__B_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x float> [[VECINIT5_I_I]], float [[TMP15]], i32 6
+// X86-NEXT:    [[TMP16:%.*]] = load float, ptr [[__A_ADDR_I_I]], align 4, !noalias [[META263]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x float> [[VECINIT6_I_I]], float [[TMP16]], i32 7
+// X86-NEXT:    store <8 x float> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META263]]
+// X86-NEXT:    [[TMP17:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META263]]
+// X86-NEXT:    store <8 x float> [[TMP17]], ptr [[TMP_I]], align 32, !alias.scope [[META260]], !noalias [[META257]]
+// X86-NEXT:    [[TMP18:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !alias.scope [[META260]], !noalias [[META257]]
+// X86-NEXT:    store <8 x float> [[TMP18]], ptr [[TMP_I]], align 32, !alias.scope [[META260]], !noalias [[META257]]
+// X86-NEXT:    [[TMP19:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !noalias [[META257]]
+// X86-NEXT:    store <8 x float> [[TMP19]], ptr [[TMP]], align 32, !alias.scope [[META257]]
+// X86-NEXT:    [[TMP20:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META257]]
+// X86-NEXT:    store <8 x float> [[TMP20]], ptr [[TMP]], align 32, !alias.scope [[META257]]
+// X86-NEXT:    [[TMP21:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP21]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP22:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP22]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_set1_ps(float A) {
-  // CHECK-LABEL: test_mm256_set1_ps
-  // CHECK: insertelement <8 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_set1_ps(A);
 }
 
+//
 __m256i test_mm256_setr_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
                              char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
                              char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
                              char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) {
-  // CHECK-LABEL: test_mm256_setr_epi8
-  // CHECK: insertelement <32 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
-  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
   return _mm256_setr_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_epi16(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i16 noundef signext [[A0:%.*]], i16 noundef signext [[A1:%.*]], i16 noundef signext [[A2:%.*]], i16 noundef signext [[A3:%.*]], i16 noundef signext [[A4:%.*]], i16 noundef signext [[A5:%.*]], i16 noundef signext [[A6:%.*]], i16 noundef signext [[A7:%.*]], i16 noundef signext [[A8:%.*]], i16 noundef signext [[A9:%.*]], i16 noundef signext [[A10:%.*]], i16 noundef signext [[A11:%.*]], i16 noundef signext [[A12:%.*]], i16 noundef signext [[A13:%.*]], i16 noundef signext [[A14:%.*]], i16 noundef signext [[A15:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W15_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W14_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W13_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W12_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W11_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W10_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W09_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W08_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W07_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W06_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W05_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W04_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W03_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W02_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W01_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W00_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x i16>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__W15_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W14_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W13_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W12_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W11_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W10_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W09_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W08_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W07_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W06_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W05_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W04_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W03_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W02_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W01_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W00_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A8_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A9_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A10_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A11_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A12_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A13_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A14_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A15_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i16 [[A0]], ptr [[A0_ADDR]], align 2
+// X86-NEXT:    store i16 [[A1]], ptr [[A1_ADDR]], align 2
+// X86-NEXT:    store i16 [[A2]], ptr [[A2_ADDR]], align 2
+// X86-NEXT:    store i16 [[A3]], ptr [[A3_ADDR]], align 2
+// X86-NEXT:    store i16 [[A4]], ptr [[A4_ADDR]], align 2
+// X86-NEXT:    store i16 [[A5]], ptr [[A5_ADDR]], align 2
+// X86-NEXT:    store i16 [[A6]], ptr [[A6_ADDR]], align 2
+// X86-NEXT:    store i16 [[A7]], ptr [[A7_ADDR]], align 2
+// X86-NEXT:    store i16 [[A8]], ptr [[A8_ADDR]], align 2
+// X86-NEXT:    store i16 [[A9]], ptr [[A9_ADDR]], align 2
+// X86-NEXT:    store i16 [[A10]], ptr [[A10_ADDR]], align 2
+// X86-NEXT:    store i16 [[A11]], ptr [[A11_ADDR]], align 2
+// X86-NEXT:    store i16 [[A12]], ptr [[A12_ADDR]], align 2
+// X86-NEXT:    store i16 [[A13]], ptr [[A13_ADDR]], align 2
+// X86-NEXT:    store i16 [[A14]], ptr [[A14_ADDR]], align 2
+// X86-NEXT:    store i16 [[A15]], ptr [[A15_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A0_ADDR]], align 2
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[A1_ADDR]], align 2
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[A2_ADDR]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A3_ADDR]], align 2
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[A4_ADDR]], align 2
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[A5_ADDR]], align 2
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[A6_ADDR]], align 2
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[A7_ADDR]], align 2
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[A8_ADDR]], align 2
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[A9_ADDR]], align 2
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[A10_ADDR]], align 2
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[A11_ADDR]], align 2
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[A12_ADDR]], align 2
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[A13_ADDR]], align 2
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[A14_ADDR]], align 2
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[A15_ADDR]], align 2
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META271:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W15_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W14_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W13_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W12_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W11_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W10_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W09_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W08_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP8]], ptr [[__W07_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP9]], ptr [[__W06_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP10]], ptr [[__W05_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP11]], ptr [[__W04_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP12]], ptr [[__W03_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP13]], ptr [[__W02_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP14]], ptr [[__W01_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    store i16 [[TMP15]], ptr [[__W00_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[__W00_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[__W01_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP18:%.*]] = load i16, ptr [[__W02_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__W03_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP20:%.*]] = load i16, ptr [[__W04_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP21:%.*]] = load i16, ptr [[__W05_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP22:%.*]] = load i16, ptr [[__W06_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__W07_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP24:%.*]] = load i16, ptr [[__W08_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP25:%.*]] = load i16, ptr [[__W09_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP26:%.*]] = load i16, ptr [[__W10_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP27:%.*]] = load i16, ptr [[__W11_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP28:%.*]] = load i16, ptr [[__W12_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP29:%.*]] = load i16, ptr [[__W13_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP30:%.*]] = load i16, ptr [[__W14_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    [[TMP31:%.*]] = load i16, ptr [[__W15_ADDR_I]], align 2, !noalias [[META271]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META274:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META277:![0-9]+]]
+// X86-NEXT:    store i16 [[TMP16]], ptr [[__W15_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP17]], ptr [[__W14_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP18]], ptr [[__W13_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP19]], ptr [[__W12_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP20]], ptr [[__W11_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP21]], ptr [[__W10_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP22]], ptr [[__W09_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP23]], ptr [[__W08_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP24]], ptr [[__W07_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP25]], ptr [[__W06_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP26]], ptr [[__W05_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP27]], ptr [[__W04_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP28]], ptr [[__W03_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP29]], ptr [[__W02_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP30]], ptr [[__W01_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    store i16 [[TMP31]], ptr [[__W00_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[TMP32:%.*]] = load i16, ptr [[__W00_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <16 x i16> poison, i16 [[TMP32]], i32 0
+// X86-NEXT:    [[TMP33:%.*]] = load i16, ptr [[__W01_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <16 x i16> [[VECINIT_I_I]], i16 [[TMP33]], i32 1
+// X86-NEXT:    [[TMP34:%.*]] = load i16, ptr [[__W02_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <16 x i16> [[VECINIT1_I_I]], i16 [[TMP34]], i32 2
+// X86-NEXT:    [[TMP35:%.*]] = load i16, ptr [[__W03_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <16 x i16> [[VECINIT2_I_I]], i16 [[TMP35]], i32 3
+// X86-NEXT:    [[TMP36:%.*]] = load i16, ptr [[__W04_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <16 x i16> [[VECINIT3_I_I]], i16 [[TMP36]], i32 4
+// X86-NEXT:    [[TMP37:%.*]] = load i16, ptr [[__W05_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <16 x i16> [[VECINIT4_I_I]], i16 [[TMP37]], i32 5
+// X86-NEXT:    [[TMP38:%.*]] = load i16, ptr [[__W06_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <16 x i16> [[VECINIT5_I_I]], i16 [[TMP38]], i32 6
+// X86-NEXT:    [[TMP39:%.*]] = load i16, ptr [[__W07_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <16 x i16> [[VECINIT6_I_I]], i16 [[TMP39]], i32 7
+// X86-NEXT:    [[TMP40:%.*]] = load i16, ptr [[__W08_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT8_I_I:%.*]] = insertelement <16 x i16> [[VECINIT7_I_I]], i16 [[TMP40]], i32 8
+// X86-NEXT:    [[TMP41:%.*]] = load i16, ptr [[__W09_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT9_I_I:%.*]] = insertelement <16 x i16> [[VECINIT8_I_I]], i16 [[TMP41]], i32 9
+// X86-NEXT:    [[TMP42:%.*]] = load i16, ptr [[__W10_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT10_I_I:%.*]] = insertelement <16 x i16> [[VECINIT9_I_I]], i16 [[TMP42]], i32 10
+// X86-NEXT:    [[TMP43:%.*]] = load i16, ptr [[__W11_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT11_I_I:%.*]] = insertelement <16 x i16> [[VECINIT10_I_I]], i16 [[TMP43]], i32 11
+// X86-NEXT:    [[TMP44:%.*]] = load i16, ptr [[__W12_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT12_I_I:%.*]] = insertelement <16 x i16> [[VECINIT11_I_I]], i16 [[TMP44]], i32 12
+// X86-NEXT:    [[TMP45:%.*]] = load i16, ptr [[__W13_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT13_I_I:%.*]] = insertelement <16 x i16> [[VECINIT12_I_I]], i16 [[TMP45]], i32 13
+// X86-NEXT:    [[TMP46:%.*]] = load i16, ptr [[__W14_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT14_I_I:%.*]] = insertelement <16 x i16> [[VECINIT13_I_I]], i16 [[TMP46]], i32 14
+// X86-NEXT:    [[TMP47:%.*]] = load i16, ptr [[__W15_ADDR_I_I]], align 2, !noalias [[META277]]
+// X86-NEXT:    [[VECINIT15_I_I:%.*]] = insertelement <16 x i16> [[VECINIT14_I_I]], i16 [[TMP47]], i32 15
+// X86-NEXT:    store <16 x i16> [[VECINIT15_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META277]]
+// X86-NEXT:    [[TMP48:%.*]] = load <16 x i16>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META277]]
+// X86-NEXT:    [[TMP49:%.*]] = bitcast <16 x i16> [[TMP48]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP49]], ptr [[TMP_I]], align 32, !alias.scope [[META274]], !noalias [[META271]]
+// X86-NEXT:    [[TMP50:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META274]], !noalias [[META271]]
+// X86-NEXT:    store <4 x i64> [[TMP50]], ptr [[TMP_I]], align 32, !alias.scope [[META274]], !noalias [[META271]]
+// X86-NEXT:    [[TMP51:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META271]]
+// X86-NEXT:    store <4 x i64> [[TMP51]], ptr [[TMP]], align 32, !alias.scope [[META271]]
+// X86-NEXT:    [[TMP52:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META271]]
+// X86-NEXT:    store <4 x i64> [[TMP52]], ptr [[TMP]], align 32, !alias.scope [[META271]]
+// X86-NEXT:    [[TMP53:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP53]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP54:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP54]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_setr_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
                               short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) {
-  // CHECK-LABEL: test_mm256_setr_epi16
-  // CHECK: insertelement <16 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
   return _mm256_setr_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_epi32(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i32 noundef [[A0:%.*]], i32 noundef [[A1:%.*]], i32 noundef [[A2:%.*]], i32 noundef [[A3:%.*]], i32 noundef [[A4:%.*]], i32 noundef [[A5:%.*]], i32 noundef [[A6:%.*]], i32 noundef [[A7:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__I0_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I3_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I4_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I5_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I6_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I7_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i32>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__I0_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I3_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I4_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I5_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I6_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I7_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i32 [[A0]], ptr [[A0_ADDR]], align 4
+// X86-NEXT:    store i32 [[A1]], ptr [[A1_ADDR]], align 4
+// X86-NEXT:    store i32 [[A2]], ptr [[A2_ADDR]], align 4
+// X86-NEXT:    store i32 [[A3]], ptr [[A3_ADDR]], align 4
+// X86-NEXT:    store i32 [[A4]], ptr [[A4_ADDR]], align 4
+// X86-NEXT:    store i32 [[A5]], ptr [[A5_ADDR]], align 4
+// X86-NEXT:    store i32 [[A6]], ptr [[A6_ADDR]], align 4
+// X86-NEXT:    store i32 [[A7]], ptr [[A7_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A0_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A1_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A2_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A3_ADDR]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[A4_ADDR]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[A5_ADDR]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[A6_ADDR]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[A7_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META278:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I0_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I1_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I2_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I3_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP4]], ptr [[__I4_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP5]], ptr [[__I5_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP6]], ptr [[__I6_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    store i32 [[TMP7]], ptr [[__I7_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__I7_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP9:%.*]] = load i32, ptr [[__I6_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__I5_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I4_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__I3_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP13:%.*]] = load i32, ptr [[__I2_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP14:%.*]] = load i32, ptr [[__I1_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    [[TMP15:%.*]] = load i32, ptr [[__I0_ADDR_I]], align 4, !noalias [[META278]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META281:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META284:![0-9]+]]
+// X86-NEXT:    store i32 [[TMP8]], ptr [[__I0_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP9]], ptr [[__I1_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP10]], ptr [[__I2_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP11]], ptr [[__I3_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP12]], ptr [[__I4_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP13]], ptr [[__I5_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP14]], ptr [[__I6_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    store i32 [[TMP15]], ptr [[__I7_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[TMP16:%.*]] = load i32, ptr [[__I7_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i32> poison, i32 [[TMP16]], i32 0
+// X86-NEXT:    [[TMP17:%.*]] = load i32, ptr [[__I6_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i32> [[VECINIT_I_I]], i32 [[TMP17]], i32 1
+// X86-NEXT:    [[TMP18:%.*]] = load i32, ptr [[__I5_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i32> [[VECINIT1_I_I]], i32 [[TMP18]], i32 2
+// X86-NEXT:    [[TMP19:%.*]] = load i32, ptr [[__I4_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i32> [[VECINIT2_I_I]], i32 [[TMP19]], i32 3
+// X86-NEXT:    [[TMP20:%.*]] = load i32, ptr [[__I3_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i32> [[VECINIT3_I_I]], i32 [[TMP20]], i32 4
+// X86-NEXT:    [[TMP21:%.*]] = load i32, ptr [[__I2_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i32> [[VECINIT4_I_I]], i32 [[TMP21]], i32 5
+// X86-NEXT:    [[TMP22:%.*]] = load i32, ptr [[__I1_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i32> [[VECINIT5_I_I]], i32 [[TMP22]], i32 6
+// X86-NEXT:    [[TMP23:%.*]] = load i32, ptr [[__I0_ADDR_I_I]], align 4, !noalias [[META284]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i32> [[VECINIT6_I_I]], i32 [[TMP23]], i32 7
+// X86-NEXT:    store <8 x i32> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META284]]
+// X86-NEXT:    [[TMP24:%.*]] = load <8 x i32>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META284]]
+// X86-NEXT:    [[TMP25:%.*]] = bitcast <8 x i32> [[TMP24]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP25]], ptr [[TMP_I]], align 32, !alias.scope [[META281]], !noalias [[META278]]
+// X86-NEXT:    [[TMP26:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META281]], !noalias [[META278]]
+// X86-NEXT:    store <4 x i64> [[TMP26]], ptr [[TMP_I]], align 32, !alias.scope [[META281]], !noalias [[META278]]
+// X86-NEXT:    [[TMP27:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META278]]
+// X86-NEXT:    store <4 x i64> [[TMP27]], ptr [[TMP]], align 32, !alias.scope [[META278]]
+// X86-NEXT:    [[TMP28:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META278]]
+// X86-NEXT:    store <4 x i64> [[TMP28]], ptr [[TMP]], align 32, !alias.scope [[META278]]
+// X86-NEXT:    [[TMP29:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP29]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP30:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP30]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_setr_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) {
-  // CHECK-LABEL: test_mm256_setr_epi32
-  // CHECK: insertelement <8 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
   return _mm256_setr_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_epi64x(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], i64 noundef [[A0:%.*]], i64 noundef [[A1:%.*]], i64 noundef [[A2:%.*]], i64 noundef [[A3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store i64 [[A0]], ptr [[A0_ADDR]], align 8
+// X86-NEXT:    store i64 [[A1]], ptr [[A1_ADDR]], align 8
+// X86-NEXT:    store i64 [[A2]], ptr [[A2_ADDR]], align 8
+// X86-NEXT:    store i64 [[A3]], ptr [[A3_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A0_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[A1_ADDR]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[A2_ADDR]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[A3_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META285:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META285]]
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__A_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__B_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__C_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    store i64 [[TMP3]], ptr [[__D_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__D_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    [[TMP5:%.*]] = load i64, ptr [[__C_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    [[TMP6:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    [[TMP7:%.*]] = load i64, ptr [[__A_ADDR_I]], align 8, !noalias [[META285]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META288:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META291:![0-9]+]]
+// X86-NEXT:    store i64 [[TMP4]], ptr [[__A_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    store i64 [[TMP5]], ptr [[__B_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    store i64 [[TMP6]], ptr [[__C_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    store i64 [[TMP7]], ptr [[__D_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    [[TMP8:%.*]] = load i64, ptr [[__D_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load i64, ptr [[__C_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i64> [[VECINIT_I_I]], i64 [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load i64, ptr [[__B_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i64> [[VECINIT1_I_I]], i64 [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load i64, ptr [[__A_ADDR_I_I]], align 8, !noalias [[META291]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i64> [[VECINIT2_I_I]], i64 [[TMP11]], i32 3
+// X86-NEXT:    store <4 x i64> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META291]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META291]]
+// X86-NEXT:    store <4 x i64> [[TMP12]], ptr [[TMP_I]], align 32, !alias.scope [[META288]], !noalias [[META285]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META288]], !noalias [[META285]]
+// X86-NEXT:    store <4 x i64> [[TMP13]], ptr [[TMP_I]], align 32, !alias.scope [[META288]], !noalias [[META285]]
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META285]]
+// X86-NEXT:    store <4 x i64> [[TMP14]], ptr [[TMP]], align 32, !alias.scope [[META285]]
+// X86-NEXT:    [[TMP15:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META285]]
+// X86-NEXT:    store <4 x i64> [[TMP15]], ptr [[TMP]], align 32, !alias.scope [[META285]]
+// X86-NEXT:    [[TMP16:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP16]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP17:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP17]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_setr_epi64x(long long A0, long long A1, long long A2, long long A3) {
-  // CHECK-LABEL: test_mm256_setr_epi64x
-  // CHECK: insertelement <4 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
   return _mm256_setr_epi64x(A0, A1, A2, A3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_m128(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__LO_ADDR_I_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META292:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META292]]
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META292]]
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META292]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META292]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META292]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META295:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META298:![0-9]+]]
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META298]]
+// X86-NEXT:    store <4 x float> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META298]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META298]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META298]]
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I_I]], ptr [[TMP_I]], align 32, !alias.scope [[META295]], !noalias [[META292]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !alias.scope [[META295]], !noalias [[META292]]
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP_I]], align 32, !alias.scope [[META295]], !noalias [[META292]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !noalias [[META292]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META292]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META292]]
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META292]]
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_setr_m128(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm256_setr_m128
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_setr_m128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_m128d(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__LO_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META299:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META299]]
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META299]]
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META299]]
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META299]]
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META299]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META302:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META305:![0-9]+]]
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META305]]
+// X86-NEXT:    store <2 x double> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META305]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META305]]
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META305]]
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I_I]], ptr [[TMP_I]], align 32, !alias.scope [[META302]], !noalias [[META299]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !alias.scope [[META302]], !noalias [[META299]]
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP_I]], align 32, !alias.scope [[META302]], !noalias [[META299]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !noalias [[META299]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META299]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META299]]
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META299]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_setr_m128d(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm256_setr_m128d
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_setr_m128d(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_m128i(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__HI_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__LO_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__LO_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__HI_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META306:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META306]]
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__LO_ADDR_I]], align 16, !noalias [[META306]]
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__HI_ADDR_I]], align 16, !noalias [[META306]]
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I]], align 16, !noalias [[META306]]
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I]], align 16, !noalias [[META306]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META309:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META312:![0-9]+]]
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META312]]
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META312]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__LO_ADDR_I_I]], align 16, !noalias [[META312]]
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__HI_ADDR_I_I]], align 16, !noalias [[META312]]
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x i64> [[SHUFFLE_I_I]], ptr [[TMP_I]], align 32, !alias.scope [[META309]], !noalias [[META306]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !alias.scope [[META309]], !noalias [[META306]]
+// X86-NEXT:    store <4 x i64> [[TMP6]], ptr [[TMP_I]], align 32, !alias.scope [[META309]], !noalias [[META306]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x i64>, ptr [[TMP_I]], align 32, !noalias [[META306]]
+// X86-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META306]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META306]]
+// X86-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP]], align 32, !alias.scope [[META306]]
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP10]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_setr_m128i(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm256_setr_m128i
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_setr_m128i(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], double noundef [[A0:%.*]], double noundef [[A1:%.*]], double noundef [[A2:%.*]], double noundef [[A3:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store double [[A0]], ptr [[A0_ADDR]], align 8
+// X86-NEXT:    store double [[A1]], ptr [[A1_ADDR]], align 8
+// X86-NEXT:    store double [[A2]], ptr [[A2_ADDR]], align 8
+// X86-NEXT:    store double [[A3]], ptr [[A3_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A0_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[A1_ADDR]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[A2_ADDR]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[A3_ADDR]], align 8
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META313:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META313]]
+// X86-NEXT:    store double [[TMP0]], ptr [[__A_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    store double [[TMP1]], ptr [[__B_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    store double [[TMP2]], ptr [[__C_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    store double [[TMP3]], ptr [[__D_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__D_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    [[TMP5:%.*]] = load double, ptr [[__C_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    [[TMP6:%.*]] = load double, ptr [[__B_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    [[TMP7:%.*]] = load double, ptr [[__A_ADDR_I]], align 8, !noalias [[META313]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META316:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META319:![0-9]+]]
+// X86-NEXT:    store double [[TMP4]], ptr [[__A_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    store double [[TMP5]], ptr [[__B_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    store double [[TMP6]], ptr [[__C_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    store double [[TMP7]], ptr [[__D_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    [[TMP8:%.*]] = load double, ptr [[__D_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x double> poison, double [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load double, ptr [[__C_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x double> [[VECINIT_I_I]], double [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load double, ptr [[__B_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x double> [[VECINIT1_I_I]], double [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load double, ptr [[__A_ADDR_I_I]], align 8, !noalias [[META319]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x double> [[VECINIT2_I_I]], double [[TMP11]], i32 3
+// X86-NEXT:    store <4 x double> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META319]]
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META319]]
+// X86-NEXT:    store <4 x double> [[TMP12]], ptr [[TMP_I]], align 32, !alias.scope [[META316]], !noalias [[META313]]
+// X86-NEXT:    [[TMP13:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !alias.scope [[META316]], !noalias [[META313]]
+// X86-NEXT:    store <4 x double> [[TMP13]], ptr [[TMP_I]], align 32, !alias.scope [[META316]], !noalias [[META313]]
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x double>, ptr [[TMP_I]], align 32, !noalias [[META313]]
+// X86-NEXT:    store <4 x double> [[TMP14]], ptr [[TMP]], align 32, !alias.scope [[META313]]
+// X86-NEXT:    [[TMP15:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META313]]
+// X86-NEXT:    store <4 x double> [[TMP15]], ptr [[TMP]], align 32, !alias.scope [[META313]]
+// X86-NEXT:    [[TMP16:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP16]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP17:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP17]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_setr_pd(double A0, double A1, double A2, double A3) {
-  // CHECK-LABEL: test_mm256_setr_pd
-  // CHECK: insertelement <4 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
-  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
   return _mm256_setr_pd(A0, A1, A2, A3);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setr_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], float noundef [[A0:%.*]], float noundef [[A1:%.*]], float noundef [[A2:%.*]], float noundef [[A3:%.*]], float noundef [[A4:%.*]], float noundef [[A5:%.*]], float noundef [[A6:%.*]], float noundef [[A7:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__C_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__D_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__E_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__F_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__G_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__H_ADDR_I_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__C_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__E_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__F_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__G_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[__H_ADDR_I:%.*]] = alloca float, align 4
+// X86-NEXT:    [[TMP_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A0_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A1_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A2_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A3_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A4_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A5_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A6_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[A7_ADDR:%.*]] = alloca float, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store float [[A0]], ptr [[A0_ADDR]], align 4
+// X86-NEXT:    store float [[A1]], ptr [[A1_ADDR]], align 4
+// X86-NEXT:    store float [[A2]], ptr [[A2_ADDR]], align 4
+// X86-NEXT:    store float [[A3]], ptr [[A3_ADDR]], align 4
+// X86-NEXT:    store float [[A4]], ptr [[A4_ADDR]], align 4
+// X86-NEXT:    store float [[A5]], ptr [[A5_ADDR]], align 4
+// X86-NEXT:    store float [[A6]], ptr [[A6_ADDR]], align 4
+// X86-NEXT:    store float [[A7]], ptr [[A7_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load float, ptr [[A0_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load float, ptr [[A1_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load float, ptr [[A2_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load float, ptr [[A3_ADDR]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load float, ptr [[A4_ADDR]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load float, ptr [[A5_ADDR]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load float, ptr [[A6_ADDR]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load float, ptr [[A7_ADDR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META320:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP0]], ptr [[__A_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP1]], ptr [[__B_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP2]], ptr [[__C_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP3]], ptr [[__D_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP4]], ptr [[__E_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP5]], ptr [[__F_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP6]], ptr [[__G_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    store float [[TMP7]], ptr [[__H_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP8:%.*]] = load float, ptr [[__H_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP9:%.*]] = load float, ptr [[__G_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP10:%.*]] = load float, ptr [[__F_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP11:%.*]] = load float, ptr [[__E_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP12:%.*]] = load float, ptr [[__D_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP13:%.*]] = load float, ptr [[__C_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP14:%.*]] = load float, ptr [[__B_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    [[TMP15:%.*]] = load float, ptr [[__A_ADDR_I]], align 4, !noalias [[META320]]
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META323:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP_I]], ptr [[RESULT_PTR_I_I]], align 4, !noalias [[META326:![0-9]+]]
+// X86-NEXT:    store float [[TMP8]], ptr [[__A_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP9]], ptr [[__B_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP10]], ptr [[__C_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP11]], ptr [[__D_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP12]], ptr [[__E_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP13]], ptr [[__F_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP14]], ptr [[__G_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    store float [[TMP15]], ptr [[__H_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[TMP16:%.*]] = load float, ptr [[__H_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x float> poison, float [[TMP16]], i32 0
+// X86-NEXT:    [[TMP17:%.*]] = load float, ptr [[__G_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x float> [[VECINIT_I_I]], float [[TMP17]], i32 1
+// X86-NEXT:    [[TMP18:%.*]] = load float, ptr [[__F_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x float> [[VECINIT1_I_I]], float [[TMP18]], i32 2
+// X86-NEXT:    [[TMP19:%.*]] = load float, ptr [[__E_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x float> [[VECINIT2_I_I]], float [[TMP19]], i32 3
+// X86-NEXT:    [[TMP20:%.*]] = load float, ptr [[__D_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x float> [[VECINIT3_I_I]], float [[TMP20]], i32 4
+// X86-NEXT:    [[TMP21:%.*]] = load float, ptr [[__C_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x float> [[VECINIT4_I_I]], float [[TMP21]], i32 5
+// X86-NEXT:    [[TMP22:%.*]] = load float, ptr [[__B_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x float> [[VECINIT5_I_I]], float [[TMP22]], i32 6
+// X86-NEXT:    [[TMP23:%.*]] = load float, ptr [[__A_ADDR_I_I]], align 4, !noalias [[META326]]
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x float> [[VECINIT6_I_I]], float [[TMP23]], i32 7
+// X86-NEXT:    store <8 x float> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META326]]
+// X86-NEXT:    [[TMP24:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32, !noalias [[META326]]
+// X86-NEXT:    store <8 x float> [[TMP24]], ptr [[TMP_I]], align 32, !alias.scope [[META323]], !noalias [[META320]]
+// X86-NEXT:    [[TMP25:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !alias.scope [[META323]], !noalias [[META320]]
+// X86-NEXT:    store <8 x float> [[TMP25]], ptr [[TMP_I]], align 32, !alias.scope [[META323]], !noalias [[META320]]
+// X86-NEXT:    [[TMP26:%.*]] = load <8 x float>, ptr [[TMP_I]], align 32, !noalias [[META320]]
+// X86-NEXT:    store <8 x float> [[TMP26]], ptr [[TMP]], align 32, !alias.scope [[META320]]
+// X86-NEXT:    [[TMP27:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META320]]
+// X86-NEXT:    store <8 x float> [[TMP27]], ptr [[TMP]], align 32, !alias.scope [[META320]]
+// X86-NEXT:    [[TMP28:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP28]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP29:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP29]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_setr_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
-  // CHECK-LABEL: test_mm256_setr_ps
-  // CHECK: insertelement <8 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
-  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
   return _mm256_setr_ps(A0, A1, A2, A3, A4, A5, A6, A7);
 }
 
+//
+// X86-LABEL: define void @test_mm256_setzero_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META327:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META327]]
+// X86-NEXT:    store <4 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META327]]
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META327]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[TMP]], align 32, !alias.scope [[META327]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META327]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META327]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_setzero_pd(void) {
-  // CHECK-LABEL: test_mm256_setzero_pd
-  // CHECK: store <4 x double> zeroinitializer
   return _mm256_setzero_pd();
 }
 
+//
+// X86-LABEL: define void @test_mm256_setzero_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META330:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META330]]
+// X86-NEXT:    store <8 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META330]]
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META330]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[TMP]], align 32, !alias.scope [[META330]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META330]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META330]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_setzero_ps(void) {
-  // CHECK-LABEL: test_mm256_setzero_ps
-  // CHECK: store <8 x float> zeroinitializer
   return _mm256_setzero_ps();
 }
 
+//
+// X86-LABEL: define void @test_mm256_setzero_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META333:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META333]]
+// X86-NEXT:    store <4 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META333]]
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32, !noalias [[META333]]
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[TMP]], align 32, !alias.scope [[META333]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META333]]
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META333]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_setzero_si256(void) {
-  // CHECK-LABEL: test_mm256_setzero_si256
-  // CHECK: store <4 x i64> zeroinitializer
   return _mm256_setzero_si256();
 }
 
+//
+// X86-LABEL: define void @test_mm256_shuffle_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[SHUFP:%.*]] = shufflevector <4 x double> [[TMP0]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// X86-NEXT:    store <4 x double> [[SHUFP]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_shuffle_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   return _mm256_shuffle_pd(A, B, 0);
 }
 
+//
+// X86-LABEL: define void @test_mm256_shuffle_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[SHUFP:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+// X86-NEXT:    store <8 x float> [[SHUFP]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_shuffle_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
   return _mm256_shuffle_ps(A, B, 0);
 }
 
+//
+// X86-LABEL: define void @test_mm256_sqrt_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META336:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META336]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META336]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META336]]
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]])
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META336]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META336]]
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META336]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_sqrt_pd(__m256d A) {
-  // CHECK-LABEL: test_mm256_sqrt_pd
-  // CHECK: call <4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.*}})
   return _mm256_sqrt_pd(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_sqrt_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META339:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META339]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META339]]
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META339]]
+// X86-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]])
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META339]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META339]]
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META339]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_sqrt_ps(__m256 A) {
-  // CHECK-LABEL: test_mm256_sqrt_ps
-  // CHECK: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.*}})
   return _mm256_sqrt_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_store_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP3]], align 32
+// X86-NEXT:    ret void
+//
 void test_mm256_store_pd(double* A, __m256d B) {
-  // CHECK-LABEL: test_mm256_store_pd
-  // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32
   _mm256_store_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_store_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP3]], align 32
+// X86-NEXT:    ret void
+//
 void test_mm256_store_ps(float* A, __m256 B) {
-  // CHECK-LABEL: test_mm256_store_ps
-  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32
   _mm256_store_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_store_si256(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP3]], align 32
+// X86-NEXT:    ret void
+//
 void test_mm256_store_si256(__m256i* A, __m256i B) {
-  // CHECK-LABEL: test_mm256_store_si256
-  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32
   _mm256_store_si256(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu_pd(double* A, __m256d B) {
-  // CHECK-LABEL: test_mm256_storeu_pd
-  // CHECK:   store <4 x double> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm256_storeu_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu_ps(float* A, __m256 B) {
-  // CHECK-LABEL: test_mm256_storeu_ps
-  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECk-NEXT: ret void
   _mm256_storeu_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu_si256(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu_si256(__m256i* A, __m256i B) {
-  // CHECK-LABEL: test_mm256_storeu_si256
-  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECk-NEXT: ret void
   _mm256_storeu_si256(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu2_m128(
+// X86-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I2:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I3:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RETVAL_I_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__V128_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <8 x float> [[C]], ptr [[C_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x float> [[SHUFFLE_I_I]], ptr [[RETVAL_I_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[COERCE_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP7]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP8]], ptr [[__P_ADDR_I2]], align 4
+// X86-NEXT:    store <4 x float> [[TMP9]], ptr [[__A_ADDR_I3]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x float>, ptr [[__A_ADDR_I3]], align 16
+// X86-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[__P_ADDR_I2]], align 4
+// X86-NEXT:    store <4 x float> [[TMP10]], ptr [[TMP11]], align 1
+// X86-NEXT:    [[TMP12:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <4 x float> [[EXTRACT_I]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    [[TMP14:%.*]] = load <4 x float>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP13]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x float> [[TMP14]], ptr [[__A_ADDR_I1]], align 16
+// X86-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[__A_ADDR_I1]], align 16
+// X86-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <4 x float> [[TMP15]], ptr [[TMP16]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu2_m128(float* A, float* B, __m256 C) {
-  // CHECK-LABEL: test_mm256_storeu2_m128
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu2_m128d(
+// X86-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], <4 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I2:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I3:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL_I_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__V128_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <4 x double> [[C]], ptr [[C_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[C_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP5]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I_I]], ptr [[RETVAL_I_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP8]], ptr [[__DP_ADDR_I2]], align 4
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[__A_ADDR_I3]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[__A_ADDR_I3]], align 16
+// X86-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[__DP_ADDR_I2]], align 4
+// X86-NEXT:    store <2 x double> [[TMP10]], ptr [[TMP11]], align 1
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+// X86-NEXT:    store <2 x double> [[EXTRACT_I]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    [[TMP14:%.*]] = load <2 x double>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP13]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP14]], ptr [[__A_ADDR_I1]], align 16
+// X86-NEXT:    [[TMP15:%.*]] = load <2 x double>, ptr [[__A_ADDR_I1]], align 16
+// X86-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP15]], ptr [[TMP16]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu2_m128d(double* A, double* B, __m256d C) {
-  // CHECK-LABEL: test_mm256_storeu2_m128d
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> <i32 2, i32 3>
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128d(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_storeu2_m128i(
+// X86-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], <4 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I1:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__ADDR_HI_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__ADDR_LO_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__V128_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[C]], ptr [[C_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[C_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I_I]], align 32
+// X86-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    store <2 x i64> [[SHUFFLE_I_I]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[__ADDR_LO_ADDR_I]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP6]], ptr [[__P_ADDR_I1]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[__B_ADDR_I2]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I2]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__P_ADDR_I1]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[TMP9]], align 1
+// X86-NEXT:    [[TMP10:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP11:%.*]] = bitcast <4 x i64> [[TMP10]] to <8 x i32>
+// X86-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i32> [[TMP11]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[EXTRACT_I]] to <2 x i64>
+// X86-NEXT:    store <2 x i64> [[TMP12]], ptr [[__V128_I]], align 16
+// X86-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[__ADDR_HI_ADDR_I]], align 4
+// X86-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[__V128_I]], align 16
+// X86-NEXT:    store ptr [[TMP13]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP14]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP15:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP15]], ptr [[TMP16]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm256_storeu2_m128i(__m128i* A, __m128i* B, __m256i C) {
-  // CHECK-LABEL: test_mm256_storeu2_m128i
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm256_storeu2_m128i(A, B, C);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342:![0-9]+]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_pd(double* A, __m256d B) {
-  // CHECK-LABEL: test_mm256_stream_pd
-  // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_pd_void(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_pd_void(void *A, __m256d B) {
-  // CHECK-LABEL: test_mm256_stream_pd_void
-  // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_ps(
+// X86-SAME: ptr noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_ps(float* A, __m256 B) {
-  // CHECK-LABEL: test_mm256_stream_ps
-  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_ps_void(
+// X86-SAME: ptr noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_ps_void(void *A, __m256 B) {
-  // CHECK-LABEL: test_mm256_stream_ps_void
-  // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_si256(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_si256(__m256i* A, __m256i B) {
-  // CHECK-LABEL: test_mm256_stream_si256
-  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_si256(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_stream_si256_void(
+// X86-SAME: ptr noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP3]], align 32, !nontemporal [[META342]]
+// X86-NEXT:    ret void
+//
 void test_mm256_stream_si256_void(void *A, __m256i B) {
-  // CHECK-LABEL: test_mm256_stream_si256_void
-  // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
   _mm256_stream_si256(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_sub_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META343:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META343]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META343]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META343]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META343]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META343]]
+// X86-NEXT:    [[SUB_I:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <4 x double> [[SUB_I]], ptr [[TMP]], align 32, !alias.scope [[META343]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META343]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META343]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_sub_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_sub_pd
-  // CHECK: fsub <4 x double>
   return _mm256_sub_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_sub_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META346:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META346]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META346]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META346]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META346]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META346]]
+// X86-NEXT:    [[SUB_I:%.*]] = fsub <8 x float> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <8 x float> [[SUB_I]], ptr [[TMP]], align 32, !alias.scope [[META346]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META346]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META346]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_sub_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_sub_ps
-  // CHECK: fsub <8 x float>
   return _mm256_sub_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testc_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testc_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_testc_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_testc_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testc_pd(
+// X86-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testc_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_testc_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_testc_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testc_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> [[TMP2]], <4 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testc_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_testc_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_testc_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testc_ps(
+// X86-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testc_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_testc_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_testc_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testc_si256(
+// X86-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> [[TMP2]], <4 x i64> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testc_si256(__m256i A, __m256i B) {
-  // CHECK-LABEL: test_mm256_testc_si256
-  // CHECK: call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_testc_si256(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testnzc_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testnzc_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_testnzc_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_testnzc_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testnzc_pd(
+// X86-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testnzc_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_testnzc_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_testnzc_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testnzc_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> [[TMP2]], <4 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testnzc_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_testnzc_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_testnzc_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testnzc_ps(
+// X86-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testnzc_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_testnzc_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_testnzc_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testnzc_si256(
+// X86-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> [[TMP2]], <4 x i64> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testnzc_si256(__m256i A, __m256i B) {
-  // CHECK-LABEL: test_mm256_testnzc_si256
-  // CHECK: call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_testnzc_si256(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testz_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testz_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_testz_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_testz_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testz_pd(
+// X86-SAME: <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> [[TMP2]], <4 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testz_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_testz_pd
-  // CHECK: call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
   return _mm256_testz_pd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_testz_ps(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> [[TMP2]], <4 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_testz_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_testz_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_testz_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testz_ps(
+// X86-SAME: <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> [[TMP2]], <8 x float> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testz_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_testz_ps
-  // CHECK: call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
   return _mm256_testz_ps(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_testz_si256(
+// X86-SAME: <4 x i64> noundef [[A:%.*]], <4 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[__B_ADDR_I]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> [[TMP2]], <4 x i64> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm256_testz_si256(__m256i A, __m256i B) {
-  // CHECK-LABEL: test_mm256_testz_si256
-  // CHECK: call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_testz_si256(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_undefined_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META349:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META349]]
+// X86-NEXT:    [[TMP0:%.*]] = freeze <4 x double> undef
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META349]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META349]]
+// X86-NEXT:    store <8 x float> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META349]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_undefined_ps(void) {
-  // X64-LABEL: test_mm256_undefined_ps
-  // X64: ret <8 x float> zeroinitializer
-  //
-  // X86-LABEL: test_mm256_undefined_ps
-  // X86: store <8 x float> zeroinitializer
   return _mm256_undefined_ps();
 }
 
+//
+// X86-LABEL: define void @test_mm256_undefined_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META352:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META352]]
+// X86-NEXT:    [[TMP0:%.*]] = freeze <4 x double> undef
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[TMP]], align 32, !alias.scope [[META352]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META352]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META352]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP2]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_undefined_pd(void) {
-  // X64-LABEL: test_mm256_undefined_pd
-  // X64: ret <4 x double> zeroinitializer
-  //
-  // X86-LABEL: test_mm256_undefined_pd
-  // X86: store <4 x double> zeroinitializer
   return _mm256_undefined_pd();
 }
 
+//
+// X86-LABEL: define void @test_mm256_undefined_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META355:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META355]]
+// X86-NEXT:    [[TMP0:%.*]] = freeze <4 x double> undef
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <4 x double> [[TMP0]] to <4 x i64>
+// X86-NEXT:    store <4 x i64> [[TMP1]], ptr [[TMP]], align 32, !alias.scope [[META355]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META355]]
+// X86-NEXT:    store <4 x i64> [[TMP2]], ptr [[TMP]], align 32, !alias.scope [[META355]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_undefined_si256(void) {
-  // X64-LABEL: test_mm256_undefined_si256
-  // X64: ret <4 x i64> zeroinitializer
-  //
-  // X86-LABEL: test_mm256_undefined_si256
-  // X86: store <4 x i64> zeroinitializer
   return _mm256_undefined_si256();
 }
 
+//
+// X86-LABEL: define void @test_mm256_unpackhi_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META358:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META358]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META358]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META358]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META358]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META358]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META358]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META358]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META358]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_unpackhi_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_unpackhi_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   return _mm256_unpackhi_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_unpackhi_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META361:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META361]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META361]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META361]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META361]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META361]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META361]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META361]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META361]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_unpackhi_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_unpackhi_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   return _mm256_unpackhi_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_unpacklo_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META364:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META364]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META364]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META364]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META364]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META364]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META364]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META364]]
+// X86-NEXT:    store <4 x double> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META364]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_unpacklo_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_unpacklo_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   return _mm256_unpacklo_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_unpacklo_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META367:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META367]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META367]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META367]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META367]]
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META367]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> [[TMP3]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META367]]
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META367]]
+// X86-NEXT:    store <8 x float> [[TMP4]], ptr [[TMP]], align 32, !alias.scope [[META367]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_unpacklo_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_unpacklo_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   return _mm256_unpacklo_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_xor_pd(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <4 x double> noundef [[A:%.*]], <4 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META370:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META370]]
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META370]]
+// X86-NEXT:    store <4 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META370]]
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32, !noalias [[META370]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x double> [[TMP2]] to <4 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x double>, ptr [[__B_ADDR_I]], align 32, !noalias [[META370]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x double> [[TMP4]] to <4 x i64>
+// X86-NEXT:    [[XOR_I:%.*]] = xor <4 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i64> [[XOR_I]] to <4 x double>
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META370]]
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META370]]
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META370]]
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_xor_pd(__m256d A, __m256d B) {
-  // CHECK-LABEL: test_mm256_xor_pd
-  // CHECK: xor <4 x i64>
   return _mm256_xor_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_xor_ps(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <8 x float> noundef [[A:%.*]], <8 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[B]], ptr [[B_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[B_ADDR]], align 32
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META373:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META373]]
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32, !noalias [[META373]]
+// X86-NEXT:    store <8 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 32, !noalias [[META373]]
+// X86-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32, !noalias [[META373]]
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <8 x float> [[TMP2]] to <8 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x float>, ptr [[__B_ADDR_I]], align 32, !noalias [[META373]]
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x float> [[TMP4]] to <8 x i32>
+// X86-NEXT:    [[XOR_I:%.*]] = xor <8 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[XOR_I]] to <8 x float>
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[TMP]], align 32, !alias.scope [[META373]]
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META373]]
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[TMP]], align 32, !alias.scope [[META373]]
+// X86-NEXT:    [[TMP8:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP8]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP9:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP9]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_xor_ps(__m256 A, __m256 B) {
-  // CHECK-LABEL: test_mm256_xor_ps
-  // CHECK: xor <8 x i32>
   return _mm256_xor_ps(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm256_zeroall(
+// X86-SAME: ) #[[ATTR3:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    call void @llvm.x86.avx.vzeroall()
+// X86-NEXT:    ret void
+//
 void test_mm256_zeroall(void) {
-  // CHECK-LABEL: test_mm256_zeroall
-  // CHECK: call void @llvm.x86.avx.vzeroall()
   return _mm256_zeroall();
 }
 
+//
+// X86-LABEL: define void @test_mm256_zeroupper(
+// X86-SAME: ) #[[ATTR3]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    call void @llvm.x86.avx.vzeroupper()
+// X86-NEXT:    ret void
+//
 void test_mm256_zeroupper(void) {
-  // CHECK-LABEL: test_mm256_zeroupper
-  // CHECK: call void @llvm.x86.avx.vzeroupper()
   return _mm256_zeroupper();
 }
 
+//
+// X86-LABEL: define void @test_mm256_zextpd128_pd256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 [[AGG_RESULT:%.*]], <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META376:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META376]]
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META376]]
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16, !noalias [[META376]]
+// X86-NEXT:    store <2 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE_I]], align 16, !noalias [[META376]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE_I]], align 16, !noalias [[META376]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x double> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META376]]
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x double>, ptr [[TMP]], align 32, !alias.scope [[META376]]
+// X86-NEXT:    store <4 x double> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META376]]
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x double>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x double> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <4 x double>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x double> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256d test_mm256_zextpd128_pd256(__m128d A) {
-  // CHECK-LABEL: test_mm256_zextpd128_pd256
-  // CHECK: store <2 x double> zeroinitializer
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_zextpd128_pd256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_zextps128_ps256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 [[AGG_RESULT:%.*]], <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META379:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META379]]
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META379]]
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16, !noalias [[META379]]
+// X86-NEXT:    store <4 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE_I]], align 16, !noalias [[META379]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE_I]], align 16, !noalias [[META379]]
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    store <8 x float> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META379]]
+// X86-NEXT:    [[TMP5:%.*]] = load <8 x float>, ptr [[TMP]], align 32, !alias.scope [[META379]]
+// X86-NEXT:    store <8 x float> [[TMP5]], ptr [[TMP]], align 32, !alias.scope [[META379]]
+// X86-NEXT:    [[TMP6:%.*]] = load <8 x float>, ptr [[TMP]], align 32
+// X86-NEXT:    store <8 x float> [[TMP6]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP7:%.*]] = load <8 x float>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <8 x float> [[TMP7]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256 test_mm256_zextps128_ps256(__m128 A) {
-  // CHECK-LABEL: test_mm256_zextps128_ps256
-  // CHECK: store <4 x float> zeroinitializer
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_zextps128_ps256(A);
 }
 
+//
+// X86-LABEL: define void @test_mm256_zextsi128_si256(
+// X86-SAME: ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 [[AGG_RESULT:%.*]], <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RESULT_PTR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[TMP:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META382:![0-9]+]])
+// X86-NEXT:    store ptr [[TMP]], ptr [[RESULT_PTR_I]], align 4, !noalias [[META382]]
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16, !noalias [[META382]]
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16, !noalias [[META382]]
+// X86-NEXT:    store <2 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// X86-NEXT:    store <4 x i64> [[SHUFFLE_I]], ptr [[TMP]], align 32, !alias.scope [[META382]]
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i64>, ptr [[TMP]], align 32, !alias.scope [[META382]]
+// X86-NEXT:    store <4 x i64> [[TMP3]], ptr [[TMP]], align 32, !alias.scope [[META382]]
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i64>, ptr [[TMP]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP4]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    [[TMP5:%.*]] = load <4 x i64>, ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP5]], ptr [[AGG_RESULT]], align 32
+// X86-NEXT:    ret void
+//
 __m256i test_mm256_zextsi128_si256(__m128i A) {
-  // CHECK-LABEL: test_mm256_zextsi128_si256
-  // CHECK: store <2 x i64> zeroinitializer
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm256_zextsi128_si256(A);
 }
 
+//
+// X86-LABEL: define double @test_mm256_cvtsd_f64(
+// X86-SAME: <4 x double> noundef [[__A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    [[__A_ADDR:%.*]] = alloca <4 x double>, align 32
+// X86-NEXT:    store <4 x double> [[__A]], ptr [[__A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[__A_ADDR]], align 32
+// X86-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
+// X86-NEXT:    ret double [[VECEXT_I]]
+//
 double test_mm256_cvtsd_f64(__m256d __a)
 {
-  // CHECK-LABEL: test_mm256_cvtsd_f64
-  // CHECK: extractelement <4 x double> %{{.*}}, i32 0
   return _mm256_cvtsd_f64(__a);
 }
 
+//
+// X86-LABEL: define i32 @test_mm256_cvtsi256_si32(
+// X86-SAME: <4 x i64> noundef [[__A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    [[__B_I:%.*]] = alloca <8 x i32>, align 32
+// X86-NEXT:    [[__A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// X86-NEXT:    store <4 x i64> [[__A]], ptr [[__A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[__A_ADDR]], align 32
+// X86-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to <8 x i32>
+// X86-NEXT:    store <8 x i32> [[TMP2]], ptr [[__B_I]], align 32
+// X86-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr [[__B_I]], align 32
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i32> [[TMP3]], i32 0
+// X86-NEXT:    ret i32 [[VECEXT_I]]
+//
 int test_mm256_cvtsi256_si32(__m256i __a)
 {
-  // CHECK-LABEL: test_mm256_cvtsi256_si32
-  // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
   return _mm256_cvtsi256_si32(__a);
 }
 
+//
+// X86-LABEL: define float @test_mm256_cvtss_f32(
+// X86-SAME: <8 x float> noundef [[__A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x float>, align 32
+// X86-NEXT:    store <8 x float> [[__A]], ptr [[__A_ADDR]], align 32
+// X86-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[__A_ADDR]], align 32
+// X86-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
+// X86-NEXT:    ret float [[VECEXT_I]]
+//
 float test_mm256_cvtss_f32(__m256 __a)
 {
-  // CHECK-LABEL: test_mm256_cvtss_f32
-  // CHECK: extractelement <8 x float> %{{.*}}, i32 0
   return _mm256_cvtss_f32(__a);
 }
+//.
+// X86: [[META3]] = !{[[META4:![0-9]+]]}
+// X86: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"_mm256_add_pd: %agg.result"}
+// X86: [[META5]] = distinct !{[[META5]], !"_mm256_add_pd"}
+// X86: [[META6]] = !{[[META7:![0-9]+]]}
+// X86: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]], !"_mm256_add_ps: %agg.result"}
+// X86: [[META8]] = distinct !{[[META8]], !"_mm256_add_ps"}
+// X86: [[META9]] = !{[[META10:![0-9]+]]}
+// X86: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]], !"_mm256_addsub_pd: %agg.result"}
+// X86: [[META11]] = distinct !{[[META11]], !"_mm256_addsub_pd"}
+// X86: [[META12]] = !{[[META13:![0-9]+]]}
+// X86: [[META13]] = distinct !{[[META13]], [[META14:![0-9]+]], !"_mm256_addsub_ps: %agg.result"}
+// X86: [[META14]] = distinct !{[[META14]], !"_mm256_addsub_ps"}
+// X86: [[META15]] = !{[[META16:![0-9]+]]}
+// X86: [[META16]] = distinct !{[[META16]], [[META17:![0-9]+]], !"_mm256_and_pd: %agg.result"}
+// X86: [[META17]] = distinct !{[[META17]], !"_mm256_and_pd"}
+// X86: [[META18]] = !{[[META19:![0-9]+]]}
+// X86: [[META19]] = distinct !{[[META19]], [[META20:![0-9]+]], !"_mm256_and_ps: %agg.result"}
+// X86: [[META20]] = distinct !{[[META20]], !"_mm256_and_ps"}
+// X86: [[META21]] = !{[[META22:![0-9]+]]}
+// X86: [[META22]] = distinct !{[[META22]], [[META23:![0-9]+]], !"_mm256_andnot_pd: %agg.result"}
+// X86: [[META23]] = distinct !{[[META23]], !"_mm256_andnot_pd"}
+// X86: [[META24]] = !{[[META25:![0-9]+]]}
+// X86: [[META25]] = distinct !{[[META25]], [[META26:![0-9]+]], !"_mm256_andnot_ps: %agg.result"}
+// X86: [[META26]] = distinct !{[[META26]], !"_mm256_andnot_ps"}
+// X86: [[META27]] = !{[[META28:![0-9]+]]}
+// X86: [[META28]] = distinct !{[[META28]], [[META29:![0-9]+]], !"_mm256_blendv_pd: %agg.result"}
+// X86: [[META29]] = distinct !{[[META29]], !"_mm256_blendv_pd"}
+// X86: [[META30]] = !{[[META31:![0-9]+]]}
+// X86: [[META31]] = distinct !{[[META31]], [[META32:![0-9]+]], !"_mm256_blendv_ps: %agg.result"}
+// X86: [[META32]] = distinct !{[[META32]], !"_mm256_blendv_ps"}
+// X86: [[META33]] = !{[[META34:![0-9]+]]}
+// X86: [[META34]] = distinct !{[[META34]], [[META35:![0-9]+]], !"_mm256_broadcast_pd: %agg.result"}
+// X86: [[META35]] = distinct !{[[META35]], !"_mm256_broadcast_pd"}
+// X86: [[META36]] = !{[[META37:![0-9]+]]}
+// X86: [[META37]] = distinct !{[[META37]], [[META38:![0-9]+]], !"_mm256_broadcast_ps: %agg.result"}
+// X86: [[META38]] = distinct !{[[META38]], !"_mm256_broadcast_ps"}
+// X86: [[META39]] = !{[[META40:![0-9]+]]}
+// X86: [[META40]] = distinct !{[[META40]], [[META41:![0-9]+]], !"_mm256_broadcast_sd: %agg.result"}
+// X86: [[META41]] = distinct !{[[META41]], !"_mm256_broadcast_sd"}
+// X86: [[META42]] = !{[[META43:![0-9]+]]}
+// X86: [[META43]] = distinct !{[[META43]], [[META44:![0-9]+]], !"_mm256_broadcast_ss: %agg.result"}
+// X86: [[META44]] = distinct !{[[META44]], !"_mm256_broadcast_ss"}
+// X86: [[META45]] = !{[[META46:![0-9]+]]}
+// X86: [[META46]] = distinct !{[[META46]], [[META47:![0-9]+]], !"_mm256_castpd_ps: %agg.result"}
+// X86: [[META47]] = distinct !{[[META47]], !"_mm256_castpd_ps"}
+// X86: [[META48]] = !{[[META49:![0-9]+]]}
+// X86: [[META49]] = distinct !{[[META49]], [[META50:![0-9]+]], !"_mm256_castpd_si256: %agg.result"}
+// X86: [[META50]] = distinct !{[[META50]], !"_mm256_castpd_si256"}
+// X86: [[META51]] = !{[[META52:![0-9]+]]}
+// X86: [[META52]] = distinct !{[[META52]], [[META53:![0-9]+]], !"_mm256_castpd128_pd256: %agg.result"}
+// X86: [[META53]] = distinct !{[[META53]], !"_mm256_castpd128_pd256"}
+// X86: [[META54]] = !{[[META55:![0-9]+]]}
+// X86: [[META55]] = distinct !{[[META55]], [[META56:![0-9]+]], !"_mm256_castps_pd: %agg.result"}
+// X86: [[META56]] = distinct !{[[META56]], !"_mm256_castps_pd"}
+// X86: [[META57]] = !{[[META58:![0-9]+]]}
+// X86: [[META58]] = distinct !{[[META58]], [[META59:![0-9]+]], !"_mm256_castps_si256: %agg.result"}
+// X86: [[META59]] = distinct !{[[META59]], !"_mm256_castps_si256"}
+// X86: [[META60]] = !{[[META61:![0-9]+]]}
+// X86: [[META61]] = distinct !{[[META61]], [[META62:![0-9]+]], !"_mm256_castps128_ps256: %agg.result"}
+// X86: [[META62]] = distinct !{[[META62]], !"_mm256_castps128_ps256"}
+// X86: [[META63]] = !{[[META64:![0-9]+]]}
+// X86: [[META64]] = distinct !{[[META64]], [[META65:![0-9]+]], !"_mm256_castsi128_si256: %agg.result"}
+// X86: [[META65]] = distinct !{[[META65]], !"_mm256_castsi128_si256"}
+// X86: [[META66]] = !{[[META67:![0-9]+]]}
+// X86: [[META67]] = distinct !{[[META67]], [[META68:![0-9]+]], !"_mm256_castsi256_pd: %agg.result"}
+// X86: [[META68]] = distinct !{[[META68]], !"_mm256_castsi256_pd"}
+// X86: [[META69]] = !{[[META70:![0-9]+]]}
+// X86: [[META70]] = distinct !{[[META70]], [[META71:![0-9]+]], !"_mm256_castsi256_ps: %agg.result"}
+// X86: [[META71]] = distinct !{[[META71]], !"_mm256_castsi256_ps"}
+// X86: [[META72]] = !{[[META73:![0-9]+]]}
+// X86: [[META73]] = distinct !{[[META73]], [[META74:![0-9]+]], !"_mm256_cvtepi32_pd: %agg.result"}
+// X86: [[META74]] = distinct !{[[META74]], !"_mm256_cvtepi32_pd"}
+// X86: [[META75]] = !{[[META76:![0-9]+]]}
+// X86: [[META76]] = distinct !{[[META76]], [[META77:![0-9]+]], !"_mm256_cvtepi32_ps: %agg.result"}
+// X86: [[META77]] = distinct !{[[META77]], !"_mm256_cvtepi32_ps"}
+// X86: [[META78]] = !{[[META79:![0-9]+]]}
+// X86: [[META79]] = distinct !{[[META79]], [[META80:![0-9]+]], !"_mm256_cvtps_epi32: %agg.result"}
+// X86: [[META80]] = distinct !{[[META80]], !"_mm256_cvtps_epi32"}
+// X86: [[META81]] = !{[[META82:![0-9]+]]}
+// X86: [[META82]] = distinct !{[[META82]], [[META83:![0-9]+]], !"_mm256_cvtps_pd: %agg.result"}
+// X86: [[META83]] = distinct !{[[META83]], !"_mm256_cvtps_pd"}
+// X86: [[META84]] = !{[[META85:![0-9]+]]}
+// X86: [[META85]] = distinct !{[[META85]], [[META86:![0-9]+]], !"_mm256_cvttps_epi32: %agg.result"}
+// X86: [[META86]] = distinct !{[[META86]], !"_mm256_cvttps_epi32"}
+// X86: [[META87]] = !{[[META88:![0-9]+]]}
+// X86: [[META88]] = distinct !{[[META88]], [[META89:![0-9]+]], !"_mm256_div_pd: %agg.result"}
+// X86: [[META89]] = distinct !{[[META89]], !"_mm256_div_pd"}
+// X86: [[META90]] = !{[[META91:![0-9]+]]}
+// X86: [[META91]] = distinct !{[[META91]], [[META92:![0-9]+]], !"_mm256_div_ps: %agg.result"}
+// X86: [[META92]] = distinct !{[[META92]], !"_mm256_div_ps"}
+// X86: [[META93]] = !{[[META94:![0-9]+]]}
+// X86: [[META94]] = distinct !{[[META94]], [[META95:![0-9]+]], !"_mm256_hadd_pd: %agg.result"}
+// X86: [[META95]] = distinct !{[[META95]], !"_mm256_hadd_pd"}
+// X86: [[META96]] = !{[[META97:![0-9]+]]}
+// X86: [[META97]] = distinct !{[[META97]], [[META98:![0-9]+]], !"_mm256_hadd_ps: %agg.result"}
+// X86: [[META98]] = distinct !{[[META98]], !"_mm256_hadd_ps"}
+// X86: [[META99]] = !{[[META100:![0-9]+]]}
+// X86: [[META100]] = distinct !{[[META100]], [[META101:![0-9]+]], !"_mm256_hsub_pd: %agg.result"}
+// X86: [[META101]] = distinct !{[[META101]], !"_mm256_hsub_pd"}
+// X86: [[META102]] = !{[[META103:![0-9]+]]}
+// X86: [[META103]] = distinct !{[[META103]], [[META104:![0-9]+]], !"_mm256_hsub_ps: %agg.result"}
+// X86: [[META104]] = distinct !{[[META104]], !"_mm256_hsub_ps"}
+// X86: [[META105]] = !{[[META106:![0-9]+]]}
+// X86: [[META106]] = distinct !{[[META106]], [[META107:![0-9]+]], !"_mm256_lddqu_si256: %agg.result"}
+// X86: [[META107]] = distinct !{[[META107]], !"_mm256_lddqu_si256"}
+// X86: [[META108]] = !{[[META109:![0-9]+]]}
+// X86: [[META109]] = distinct !{[[META109]], [[META110:![0-9]+]], !"_mm256_load_pd: %agg.result"}
+// X86: [[META110]] = distinct !{[[META110]], !"_mm256_load_pd"}
+// X86: [[META111]] = !{[[META112:![0-9]+]]}
+// X86: [[META112]] = distinct !{[[META112]], [[META113:![0-9]+]], !"_mm256_load_ps: %agg.result"}
+// X86: [[META113]] = distinct !{[[META113]], !"_mm256_load_ps"}
+// X86: [[META114]] = !{[[META115:![0-9]+]]}
+// X86: [[META115]] = distinct !{[[META115]], [[META116:![0-9]+]], !"_mm256_load_si256: %agg.result"}
+// X86: [[META116]] = distinct !{[[META116]], !"_mm256_load_si256"}
+// X86: [[META117]] = !{[[META118:![0-9]+]]}
+// X86: [[META118]] = distinct !{[[META118]], [[META119:![0-9]+]], !"_mm256_loadu_pd: %agg.result"}
+// X86: [[META119]] = distinct !{[[META119]], !"_mm256_loadu_pd"}
+// X86: [[META120]] = !{[[META121:![0-9]+]]}
+// X86: [[META121]] = distinct !{[[META121]], [[META122:![0-9]+]], !"_mm256_loadu_ps: %agg.result"}
+// X86: [[META122]] = distinct !{[[META122]], !"_mm256_loadu_ps"}
+// X86: [[META123]] = !{[[META124:![0-9]+]]}
+// X86: [[META124]] = distinct !{[[META124]], [[META125:![0-9]+]], !"_mm256_loadu_si256: %agg.result"}
+// X86: [[META125]] = distinct !{[[META125]], !"_mm256_loadu_si256"}
+// X86: [[META126]] = !{[[META127:![0-9]+]]}
+// X86: [[META127]] = distinct !{[[META127]], [[META128:![0-9]+]], !"_mm256_loadu2_m128: %agg.result"}
+// X86: [[META128]] = distinct !{[[META128]], !"_mm256_loadu2_m128"}
+// X86: [[META129]] = !{[[META130:![0-9]+]]}
+// X86: [[META130]] = distinct !{[[META130]], [[META131:![0-9]+]], !"_mm256_set_m128: %agg.result"}
+// X86: [[META131]] = distinct !{[[META131]], !"_mm256_set_m128"}
+// X86: [[META132]] = !{[[META133:![0-9]+]]}
+// X86: [[META133]] = distinct !{[[META133]], [[META134:![0-9]+]], !"_mm256_loadu2_m128d: %agg.result"}
+// X86: [[META134]] = distinct !{[[META134]], !"_mm256_loadu2_m128d"}
+// X86: [[META135]] = !{[[META136:![0-9]+]]}
+// X86: [[META136]] = distinct !{[[META136]], [[META137:![0-9]+]], !"_mm256_set_m128d: %agg.result"}
+// X86: [[META137]] = distinct !{[[META137]], !"_mm256_set_m128d"}
+// X86: [[META138]] = !{[[META139:![0-9]+]]}
+// X86: [[META139]] = distinct !{[[META139]], [[META140:![0-9]+]], !"_mm256_loadu2_m128i: %agg.result"}
+// X86: [[META140]] = distinct !{[[META140]], !"_mm256_loadu2_m128i"}
+// X86: [[META141]] = !{[[META142:![0-9]+]]}
+// X86: [[META142]] = distinct !{[[META142]], [[META143:![0-9]+]], !"_mm256_set_m128i: %agg.result"}
+// X86: [[META143]] = distinct !{[[META143]], !"_mm256_set_m128i"}
+// X86: [[META144]] = !{[[META145:![0-9]+]]}
+// X86: [[META145]] = distinct !{[[META145]], [[META146:![0-9]+]], !"_mm256_maskload_pd: %agg.result"}
+// X86: [[META146]] = distinct !{[[META146]], !"_mm256_maskload_pd"}
+// X86: [[META147]] = !{[[META148:![0-9]+]]}
+// X86: [[META148]] = distinct !{[[META148]], [[META149:![0-9]+]], !"_mm256_maskload_ps: %agg.result"}
+// X86: [[META149]] = distinct !{[[META149]], !"_mm256_maskload_ps"}
+// X86: [[META150]] = !{[[META151:![0-9]+]]}
+// X86: [[META151]] = distinct !{[[META151]], [[META152:![0-9]+]], !"_mm256_max_pd: %agg.result"}
+// X86: [[META152]] = distinct !{[[META152]], !"_mm256_max_pd"}
+// X86: [[META153]] = !{[[META154:![0-9]+]]}
+// X86: [[META154]] = distinct !{[[META154]], [[META155:![0-9]+]], !"_mm256_max_ps: %agg.result"}
+// X86: [[META155]] = distinct !{[[META155]], !"_mm256_max_ps"}
+// X86: [[META156]] = !{[[META157:![0-9]+]]}
+// X86: [[META157]] = distinct !{[[META157]], [[META158:![0-9]+]], !"_mm256_min_pd: %agg.result"}
+// X86: [[META158]] = distinct !{[[META158]], !"_mm256_min_pd"}
+// X86: [[META159]] = !{[[META160:![0-9]+]]}
+// X86: [[META160]] = distinct !{[[META160]], [[META161:![0-9]+]], !"_mm256_min_ps: %agg.result"}
+// X86: [[META161]] = distinct !{[[META161]], !"_mm256_min_ps"}
+// X86: [[META162]] = !{[[META163:![0-9]+]]}
+// X86: [[META163]] = distinct !{[[META163]], [[META164:![0-9]+]], !"_mm256_movedup_pd: %agg.result"}
+// X86: [[META164]] = distinct !{[[META164]], !"_mm256_movedup_pd"}
+// X86: [[META165]] = !{[[META166:![0-9]+]]}
+// X86: [[META166]] = distinct !{[[META166]], [[META167:![0-9]+]], !"_mm256_movehdup_ps: %agg.result"}
+// X86: [[META167]] = distinct !{[[META167]], !"_mm256_movehdup_ps"}
+// X86: [[META168]] = !{[[META169:![0-9]+]]}
+// X86: [[META169]] = distinct !{[[META169]], [[META170:![0-9]+]], !"_mm256_moveldup_ps: %agg.result"}
+// X86: [[META170]] = distinct !{[[META170]], !"_mm256_moveldup_ps"}
+// X86: [[META171]] = !{[[META172:![0-9]+]]}
+// X86: [[META172]] = distinct !{[[META172]], [[META173:![0-9]+]], !"_mm256_mul_pd: %agg.result"}
+// X86: [[META173]] = distinct !{[[META173]], !"_mm256_mul_pd"}
+// X86: [[META174]] = !{[[META175:![0-9]+]]}
+// X86: [[META175]] = distinct !{[[META175]], [[META176:![0-9]+]], !"_mm256_mul_ps: %agg.result"}
+// X86: [[META176]] = distinct !{[[META176]], !"_mm256_mul_ps"}
+// X86: [[META177]] = !{[[META178:![0-9]+]]}
+// X86: [[META178]] = distinct !{[[META178]], [[META179:![0-9]+]], !"_mm256_or_pd: %agg.result"}
+// X86: [[META179]] = distinct !{[[META179]], !"_mm256_or_pd"}
+// X86: [[META180]] = !{[[META181:![0-9]+]]}
+// X86: [[META181]] = distinct !{[[META181]], [[META182:![0-9]+]], !"_mm256_or_ps: %agg.result"}
+// X86: [[META182]] = distinct !{[[META182]], !"_mm256_or_ps"}
+// X86: [[META183]] = !{[[META184:![0-9]+]]}
+// X86: [[META184]] = distinct !{[[META184]], [[META185:![0-9]+]], !"_mm256_permutevar_pd: %agg.result"}
+// X86: [[META185]] = distinct !{[[META185]], !"_mm256_permutevar_pd"}
+// X86: [[META186]] = !{[[META187:![0-9]+]]}
+// X86: [[META187]] = distinct !{[[META187]], [[META188:![0-9]+]], !"_mm256_permutevar_ps: %agg.result"}
+// X86: [[META188]] = distinct !{[[META188]], !"_mm256_permutevar_ps"}
+// X86: [[META189]] = !{[[META190:![0-9]+]]}
+// X86: [[META190]] = distinct !{[[META190]], [[META191:![0-9]+]], !"_mm256_rcp_ps: %agg.result"}
+// X86: [[META191]] = distinct !{[[META191]], !"_mm256_rcp_ps"}
+// X86: [[META192]] = !{[[META193:![0-9]+]]}
+// X86: [[META193]] = distinct !{[[META193]], [[META194:![0-9]+]], !"_mm256_rsqrt_ps: %agg.result"}
+// X86: [[META194]] = distinct !{[[META194]], !"_mm256_rsqrt_ps"}
+// X86: [[META198]] = !{[[META199:![0-9]+]]}
+// X86: [[META199]] = distinct !{[[META199]], [[META200:![0-9]+]], !"_mm256_set_epi16: %agg.result"}
+// X86: [[META200]] = distinct !{[[META200]], !"_mm256_set_epi16"}
+// X86: [[META201]] = !{[[META202:![0-9]+]]}
+// X86: [[META202]] = distinct !{[[META202]], [[META203:![0-9]+]], !"_mm256_set_epi32: %agg.result"}
+// X86: [[META203]] = distinct !{[[META203]], !"_mm256_set_epi32"}
+// X86: [[META204]] = !{[[META205:![0-9]+]]}
+// X86: [[META205]] = distinct !{[[META205]], [[META206:![0-9]+]], !"_mm256_set_epi64x: %agg.result"}
+// X86: [[META206]] = distinct !{[[META206]], !"_mm256_set_epi64x"}
+// X86: [[META207]] = !{[[META208:![0-9]+]]}
+// X86: [[META208]] = distinct !{[[META208]], [[META209:![0-9]+]], !"_mm256_set_m128: %agg.result"}
+// X86: [[META209]] = distinct !{[[META209]], !"_mm256_set_m128"}
+// X86: [[META210]] = !{[[META211:![0-9]+]]}
+// X86: [[META211]] = distinct !{[[META211]], [[META212:![0-9]+]], !"_mm256_set_m128d: %agg.result"}
+// X86: [[META212]] = distinct !{[[META212]], !"_mm256_set_m128d"}
+// X86: [[META213]] = !{[[META214:![0-9]+]]}
+// X86: [[META214]] = distinct !{[[META214]], [[META215:![0-9]+]], !"_mm256_set_m128i: %agg.result"}
+// X86: [[META215]] = distinct !{[[META215]], !"_mm256_set_m128i"}
+// X86: [[META216]] = !{[[META217:![0-9]+]]}
+// X86: [[META217]] = distinct !{[[META217]], [[META218:![0-9]+]], !"_mm256_set_pd: %agg.result"}
+// X86: [[META218]] = distinct !{[[META218]], !"_mm256_set_pd"}
+// X86: [[META219]] = !{[[META220:![0-9]+]]}
+// X86: [[META220]] = distinct !{[[META220]], [[META221:![0-9]+]], !"_mm256_set_ps: %agg.result"}
+// X86: [[META221]] = distinct !{[[META221]], !"_mm256_set_ps"}
+// X86: [[META229]] = !{[[META230:![0-9]+]]}
+// X86: [[META230]] = distinct !{[[META230]], [[META231:![0-9]+]], !"_mm256_set1_epi16: %agg.result"}
+// X86: [[META231]] = distinct !{[[META231]], !"_mm256_set1_epi16"}
+// X86: [[META232]] = !{[[META233:![0-9]+]]}
+// X86: [[META233]] = distinct !{[[META233]], [[META234:![0-9]+]], !"_mm256_set_epi16: %agg.result"}
+// X86: [[META234]] = distinct !{[[META234]], !"_mm256_set_epi16"}
+// X86: [[META235]] = !{[[META233]], [[META230]]}
+// X86: [[META236]] = !{[[META237:![0-9]+]]}
+// X86: [[META237]] = distinct !{[[META237]], [[META238:![0-9]+]], !"_mm256_set1_epi32: %agg.result"}
+// X86: [[META238]] = distinct !{[[META238]], !"_mm256_set1_epi32"}
+// X86: [[META239]] = !{[[META240:![0-9]+]]}
+// X86: [[META240]] = distinct !{[[META240]], [[META241:![0-9]+]], !"_mm256_set_epi32: %agg.result"}
+// X86: [[META241]] = distinct !{[[META241]], !"_mm256_set_epi32"}
+// X86: [[META242]] = !{[[META240]], [[META237]]}
+// X86: [[META243]] = !{[[META244:![0-9]+]]}
+// X86: [[META244]] = distinct !{[[META244]], [[META245:![0-9]+]], !"_mm256_set1_epi64x: %agg.result"}
+// X86: [[META245]] = distinct !{[[META245]], !"_mm256_set1_epi64x"}
+// X86: [[META246]] = !{[[META247:![0-9]+]]}
+// X86: [[META247]] = distinct !{[[META247]], [[META248:![0-9]+]], !"_mm256_set_epi64x: %agg.result"}
+// X86: [[META248]] = distinct !{[[META248]], !"_mm256_set_epi64x"}
+// X86: [[META249]] = !{[[META247]], [[META244]]}
+// X86: [[META250]] = !{[[META251:![0-9]+]]}
+// X86: [[META251]] = distinct !{[[META251]], [[META252:![0-9]+]], !"_mm256_set1_pd: %agg.result"}
+// X86: [[META252]] = distinct !{[[META252]], !"_mm256_set1_pd"}
+// X86: [[META253]] = !{[[META254:![0-9]+]]}
+// X86: [[META254]] = distinct !{[[META254]], [[META255:![0-9]+]], !"_mm256_set_pd: %agg.result"}
+// X86: [[META255]] = distinct !{[[META255]], !"_mm256_set_pd"}
+// X86: [[META256]] = !{[[META254]], [[META251]]}
+// X86: [[META257]] = !{[[META258:![0-9]+]]}
+// X86: [[META258]] = distinct !{[[META258]], [[META259:![0-9]+]], !"_mm256_set1_ps: %agg.result"}
+// X86: [[META259]] = distinct !{[[META259]], !"_mm256_set1_ps"}
+// X86: [[META260]] = !{[[META261:![0-9]+]]}
+// X86: [[META261]] = distinct !{[[META261]], [[META262:![0-9]+]], !"_mm256_set_ps: %agg.result"}
+// X86: [[META262]] = distinct !{[[META262]], !"_mm256_set_ps"}
+// X86: [[META263]] = !{[[META261]], [[META258]]}
+// X86: [[META271]] = !{[[META272:![0-9]+]]}
+// X86: [[META272]] = distinct !{[[META272]], [[META273:![0-9]+]], !"_mm256_setr_epi16: %agg.result"}
+// X86: [[META273]] = distinct !{[[META273]], !"_mm256_setr_epi16"}
+// X86: [[META274]] = !{[[META275:![0-9]+]]}
+// X86: [[META275]] = distinct !{[[META275]], [[META276:![0-9]+]], !"_mm256_set_epi16: %agg.result"}
+// X86: [[META276]] = distinct !{[[META276]], !"_mm256_set_epi16"}
+// X86: [[META277]] = !{[[META275]], [[META272]]}
+// X86: [[META278]] = !{[[META279:![0-9]+]]}
+// X86: [[META279]] = distinct !{[[META279]], [[META280:![0-9]+]], !"_mm256_setr_epi32: %agg.result"}
+// X86: [[META280]] = distinct !{[[META280]], !"_mm256_setr_epi32"}
+// X86: [[META281]] = !{[[META282:![0-9]+]]}
+// X86: [[META282]] = distinct !{[[META282]], [[META283:![0-9]+]], !"_mm256_set_epi32: %agg.result"}
+// X86: [[META283]] = distinct !{[[META283]], !"_mm256_set_epi32"}
+// X86: [[META284]] = !{[[META282]], [[META279]]}
+// X86: [[META285]] = !{[[META286:![0-9]+]]}
+// X86: [[META286]] = distinct !{[[META286]], [[META287:![0-9]+]], !"_mm256_setr_epi64x: %agg.result"}
+// X86: [[META287]] = distinct !{[[META287]], !"_mm256_setr_epi64x"}
+// X86: [[META288]] = !{[[META289:![0-9]+]]}
+// X86: [[META289]] = distinct !{[[META289]], [[META290:![0-9]+]], !"_mm256_set_epi64x: %agg.result"}
+// X86: [[META290]] = distinct !{[[META290]], !"_mm256_set_epi64x"}
+// X86: [[META291]] = !{[[META289]], [[META286]]}
+// X86: [[META292]] = !{[[META293:![0-9]+]]}
+// X86: [[META293]] = distinct !{[[META293]], [[META294:![0-9]+]], !"_mm256_setr_m128: %agg.result"}
+// X86: [[META294]] = distinct !{[[META294]], !"_mm256_setr_m128"}
+// X86: [[META295]] = !{[[META296:![0-9]+]]}
+// X86: [[META296]] = distinct !{[[META296]], [[META297:![0-9]+]], !"_mm256_set_m128: %agg.result"}
+// X86: [[META297]] = distinct !{[[META297]], !"_mm256_set_m128"}
+// X86: [[META298]] = !{[[META296]], [[META293]]}
+// X86: [[META299]] = !{[[META300:![0-9]+]]}
+// X86: [[META300]] = distinct !{[[META300]], [[META301:![0-9]+]], !"_mm256_setr_m128d: %agg.result"}
+// X86: [[META301]] = distinct !{[[META301]], !"_mm256_setr_m128d"}
+// X86: [[META302]] = !{[[META303:![0-9]+]]}
+// X86: [[META303]] = distinct !{[[META303]], [[META304:![0-9]+]], !"_mm256_set_m128d: %agg.result"}
+// X86: [[META304]] = distinct !{[[META304]], !"_mm256_set_m128d"}
+// X86: [[META305]] = !{[[META303]], [[META300]]}
+// X86: [[META306]] = !{[[META307:![0-9]+]]}
+// X86: [[META307]] = distinct !{[[META307]], [[META308:![0-9]+]], !"_mm256_setr_m128i: %agg.result"}
+// X86: [[META308]] = distinct !{[[META308]], !"_mm256_setr_m128i"}
+// X86: [[META309]] = !{[[META310:![0-9]+]]}
+// X86: [[META310]] = distinct !{[[META310]], [[META311:![0-9]+]], !"_mm256_set_m128i: %agg.result"}
+// X86: [[META311]] = distinct !{[[META311]], !"_mm256_set_m128i"}
+// X86: [[META312]] = !{[[META310]], [[META307]]}
+// X86: [[META313]] = !{[[META314:![0-9]+]]}
+// X86: [[META314]] = distinct !{[[META314]], [[META315:![0-9]+]], !"_mm256_setr_pd: %agg.result"}
+// X86: [[META315]] = distinct !{[[META315]], !"_mm256_setr_pd"}
+// X86: [[META316]] = !{[[META317:![0-9]+]]}
+// X86: [[META317]] = distinct !{[[META317]], [[META318:![0-9]+]], !"_mm256_set_pd: %agg.result"}
+// X86: [[META318]] = distinct !{[[META318]], !"_mm256_set_pd"}
+// X86: [[META319]] = !{[[META317]], [[META314]]}
+// X86: [[META320]] = !{[[META321:![0-9]+]]}
+// X86: [[META321]] = distinct !{[[META321]], [[META322:![0-9]+]], !"_mm256_setr_ps: %agg.result"}
+// X86: [[META322]] = distinct !{[[META322]], !"_mm256_setr_ps"}
+// X86: [[META323]] = !{[[META324:![0-9]+]]}
+// X86: [[META324]] = distinct !{[[META324]], [[META325:![0-9]+]], !"_mm256_set_ps: %agg.result"}
+// X86: [[META325]] = distinct !{[[META325]], !"_mm256_set_ps"}
+// X86: [[META326]] = !{[[META324]], [[META321]]}
+// X86: [[META327]] = !{[[META328:![0-9]+]]}
+// X86: [[META328]] = distinct !{[[META328]], [[META329:![0-9]+]], !"_mm256_setzero_pd: %agg.result"}
+// X86: [[META329]] = distinct !{[[META329]], !"_mm256_setzero_pd"}
+// X86: [[META330]] = !{[[META331:![0-9]+]]}
+// X86: [[META331]] = distinct !{[[META331]], [[META332:![0-9]+]], !"_mm256_setzero_ps: %agg.result"}
+// X86: [[META332]] = distinct !{[[META332]], !"_mm256_setzero_ps"}
+// X86: [[META333]] = !{[[META334:![0-9]+]]}
+// X86: [[META334]] = distinct !{[[META334]], [[META335:![0-9]+]], !"_mm256_setzero_si256: %agg.result"}
+// X86: [[META335]] = distinct !{[[META335]], !"_mm256_setzero_si256"}
+// X86: [[META336]] = !{[[META337:![0-9]+]]}
+// X86: [[META337]] = distinct !{[[META337]], [[META338:![0-9]+]], !"_mm256_sqrt_pd: %agg.result"}
+// X86: [[META338]] = distinct !{[[META338]], !"_mm256_sqrt_pd"}
+// X86: [[META339]] = !{[[META340:![0-9]+]]}
+// X86: [[META340]] = distinct !{[[META340]], [[META341:![0-9]+]], !"_mm256_sqrt_ps: %agg.result"}
+// X86: [[META341]] = distinct !{[[META341]], !"_mm256_sqrt_ps"}
+// X86: [[META342]] = !{i32 1}
+// X86: [[META343]] = !{[[META344:![0-9]+]]}
+// X86: [[META344]] = distinct !{[[META344]], [[META345:![0-9]+]], !"_mm256_sub_pd: %agg.result"}
+// X86: [[META345]] = distinct !{[[META345]], !"_mm256_sub_pd"}
+// X86: [[META346]] = !{[[META347:![0-9]+]]}
+// X86: [[META347]] = distinct !{[[META347]], [[META348:![0-9]+]], !"_mm256_sub_ps: %agg.result"}
+// X86: [[META348]] = distinct !{[[META348]], !"_mm256_sub_ps"}
+// X86: [[META349]] = !{[[META350:![0-9]+]]}
+// X86: [[META350]] = distinct !{[[META350]], [[META351:![0-9]+]], !"_mm256_undefined_ps: %agg.result"}
+// X86: [[META351]] = distinct !{[[META351]], !"_mm256_undefined_ps"}
+// X86: [[META352]] = !{[[META353:![0-9]+]]}
+// X86: [[META353]] = distinct !{[[META353]], [[META354:![0-9]+]], !"_mm256_undefined_pd: %agg.result"}
+// X86: [[META354]] = distinct !{[[META354]], !"_mm256_undefined_pd"}
+// X86: [[META355]] = !{[[META356:![0-9]+]]}
+// X86: [[META356]] = distinct !{[[META356]], [[META357:![0-9]+]], !"_mm256_undefined_si256: %agg.result"}
+// X86: [[META357]] = distinct !{[[META357]], !"_mm256_undefined_si256"}
+// X86: [[META358]] = !{[[META359:![0-9]+]]}
+// X86: [[META359]] = distinct !{[[META359]], [[META360:![0-9]+]], !"_mm256_unpackhi_pd: %agg.result"}
+// X86: [[META360]] = distinct !{[[META360]], !"_mm256_unpackhi_pd"}
+// X86: [[META361]] = !{[[META362:![0-9]+]]}
+// X86: [[META362]] = distinct !{[[META362]], [[META363:![0-9]+]], !"_mm256_unpackhi_ps: %agg.result"}
+// X86: [[META363]] = distinct !{[[META363]], !"_mm256_unpackhi_ps"}
+// X86: [[META364]] = !{[[META365:![0-9]+]]}
+// X86: [[META365]] = distinct !{[[META365]], [[META366:![0-9]+]], !"_mm256_unpacklo_pd: %agg.result"}
+// X86: [[META366]] = distinct !{[[META366]], !"_mm256_unpacklo_pd"}
+// X86: [[META367]] = !{[[META368:![0-9]+]]}
+// X86: [[META368]] = distinct !{[[META368]], [[META369:![0-9]+]], !"_mm256_unpacklo_ps: %agg.result"}
+// X86: [[META369]] = distinct !{[[META369]], !"_mm256_unpacklo_ps"}
+// X86: [[META370]] = !{[[META371:![0-9]+]]}
+// X86: [[META371]] = distinct !{[[META371]], [[META372:![0-9]+]], !"_mm256_xor_pd: %agg.result"}
+// X86: [[META372]] = distinct !{[[META372]], !"_mm256_xor_pd"}
+// X86: [[META373]] = !{[[META374:![0-9]+]]}
+// X86: [[META374]] = distinct !{[[META374]], [[META375:![0-9]+]], !"_mm256_xor_ps: %agg.result"}
+// X86: [[META375]] = distinct !{[[META375]], !"_mm256_xor_ps"}
+// X86: [[META376]] = !{[[META377:![0-9]+]]}
+// X86: [[META377]] = distinct !{[[META377]], [[META378:![0-9]+]], !"_mm256_zextpd128_pd256: %agg.result"}
+// X86: [[META378]] = distinct !{[[META378]], !"_mm256_zextpd128_pd256"}
+// X86: [[META379]] = !{[[META380:![0-9]+]]}
+// X86: [[META380]] = distinct !{[[META380]], [[META381:![0-9]+]], !"_mm256_zextps128_ps256: %agg.result"}
+// X86: [[META381]] = distinct !{[[META381]], !"_mm256_zextps128_ps256"}
+// X86: [[META382]] = !{[[META383:![0-9]+]]}
+// X86: [[META383]] = distinct !{[[META383]], [[META384:![0-9]+]], !"_mm256_zextsi128_si256: %agg.result"}
+// X86: [[META384]] = distinct !{[[META384]], !"_mm256_zextsi128_si256"}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
+// X64: {{.*}}
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index fd72e25afdb45c..636f7e125214c5 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
@@ -9,107 +10,70 @@
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
 
 __m256i test_mm256_abs_epi8(__m256i a) {
-  // CHECK-LABEL: test_mm256_abs_epi8
-  // CHECK: [[ABS:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %{{.*}}, i1 false)
   return _mm256_abs_epi8(a);
 }
 
 __m256i test_mm256_abs_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_abs_epi16
-  // CHECK: [[ABS:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %{{.*}}, i1 false)
   return _mm256_abs_epi16(a);
 }
 
 __m256i test_mm256_abs_epi32(__m256i a) {
-  // CHECK-LABEL: test_mm256_abs_epi32
-  // CHECK: [[ABS:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %{{.*}}, i1 false)
   return _mm256_abs_epi32(a);
 }
 
 __m256i test_mm256_add_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_add_epi8
-  // CHECK: add <32 x i8>
   return _mm256_add_epi8(a, b);
 }
 
 __m256i test_mm256_add_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_add_epi16
-  // CHECK: add <16 x i16>
   return _mm256_add_epi16(a, b);
 }
 
 __m256i test_mm256_add_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_add_epi32
-  // CHECK: add <8 x i32>
   return _mm256_add_epi32(a, b);
 }
 
 __m256i test_mm256_add_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_add_epi64
-  // CHECK: add <4 x i64>
   return _mm256_add_epi64(a, b);
 }
 
 __m256i test_mm256_adds_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_adds_epi8
-  // CHECK: call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epi8(a, b);
 }
 
 __m256i test_mm256_adds_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_adds_epi16
-  // CHECK: call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epi16(a, b);
 }
 
 __m256i test_mm256_adds_epu8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_adds_epu8
-  // CHECK-NOT: call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
-  // CHECK: call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epu8(a, b);
 }
 
 __m256i test_mm256_adds_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_adds_epu16
-  // CHECK-NOT: call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
-  // CHECK: call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epu16(a, b);
 }
 
 __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_alignr_epi8
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
   return _mm256_alignr_epi8(a, b, 2);
 }
 
 __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test2_mm256_alignr_epi8
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
   return _mm256_alignr_epi8(a, b, 17);
 }
 
 __m256i test_mm256_and_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_and_si256
-  // CHECK: and <4 x i64>
   return _mm256_and_si256(a, b);
 }
 
 __m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_andnot_si256
-  // CHECK: xor <4 x i64>
-  // CHECK: and <4 x i64>
   return _mm256_andnot_si256(a, b);
 }
 
 __m256i test_mm256_avg_epu8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_avg_epu8
-  // CHECK: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_avg_epu8(a, b);
 }
 
 __m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_avg_epu16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_avg_epu16(a, b);
 }
 
@@ -117,1218 +81,757 @@ __m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
 // functions to this IR. In the future we could delete the corresponding
 // intrinsic in LLVM if it's not being used anymore.
 __m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_blend_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pblendw
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm256_blend_epi16(a, b, 2);
 }
 
 __m128i test_mm_blend_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_blend_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pblendd.128
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
   return _mm_blend_epi32(a, b, 0x05);
 }
 
 __m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_blend_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pblendd.256
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
   return _mm256_blend_epi32(a, b, 0x35);
 }
 
 __m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
-  // CHECK-LABEL: test_mm256_blendv_epi8
-  // CHECK: call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_blendv_epi8(a, b, m);
 }
 
 __m128i test_mm_broadcastb_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastb_epi8
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.128
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
   return _mm_broadcastb_epi8(a);
 }
 
 __m256i test_mm256_broadcastb_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastb_epi8
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
   return _mm256_broadcastb_epi8(a);
 }
 
 __m128i test_mm_broadcastd_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastd_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.128
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
   return _mm_broadcastd_epi32(a);
 }
 
 __m256i test_mm256_broadcastd_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastd_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
   return _mm256_broadcastd_epi32(a);
 }
 
 __m128i test_mm_broadcastq_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastq_epi64
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.128
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
   return _mm_broadcastq_epi64(a);
 }
 
 __m256i test_mm256_broadcastq_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastq_epi64
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
   return _mm256_broadcastq_epi64(a);
 }
 
 __m128d test_mm_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
   return _mm_broadcastsd_pd(a);
 }
 
 __m256d test_mm256_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm256_broadcastsd_pd
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
   return _mm256_broadcastsd_pd(a);
 }
 
 __m256i test_mm256_broadcastsi128_si256(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastsi128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm256_broadcastsi128_si256(a);
 }
 
 __m256i test_mm_broadcastsi128_si256(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastsi128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   return _mm_broadcastsi128_si256(a);
 }
 
 __m128 test_mm_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
   return _mm_broadcastss_ps(a);
 }
 
 __m256 test_mm256_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm256_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
   return _mm256_broadcastss_ps(a);
 }
 
 __m128i test_mm_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
   return _mm_broadcastw_epi16(a);
 }
 
 __m256i test_mm256_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
   return _mm256_broadcastw_epi16(a);
 }
 
 __m256i test_mm256_bslli_epi128(__m256i a) {
-  // CHECK-LABEL: test_mm256_bslli_epi128
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
   return _mm256_bslli_epi128(a, 3);
 }
 
 __m256i test_mm256_bsrli_epi128(__m256i a) {
-  // CHECK-LABEL: test_mm256_bsrli_epi128
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
   return _mm256_bsrli_epi128(a, 3);
 }
 
 __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpeq_epi8
-  // CHECK: icmp eq <32 x i8>
   return _mm256_cmpeq_epi8(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpeq_epi16
-  // CHECK: icmp eq <16 x i16>
   return _mm256_cmpeq_epi16(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpeq_epi32
-  // CHECK: icmp eq <8 x i32>
   return _mm256_cmpeq_epi32(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpeq_epi64
-  // CHECK: icmp eq <4 x i64>
   return _mm256_cmpeq_epi64(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpgt_epi8
-  // CHECK: icmp sgt <32 x i8>
   return _mm256_cmpgt_epi8(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpgt_epi16
-  // CHECK: icmp sgt <16 x i16>
   return _mm256_cmpgt_epi16(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpgt_epi32
-  // CHECK: icmp sgt <8 x i32>
   return _mm256_cmpgt_epi32(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_cmpgt_epi64
-  // CHECK: icmp sgt <4 x i64>
   return _mm256_cmpgt_epi64(a, b);
 }
 
 __m256i test_mm256_cvtepi8_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi8_epi16
-  // CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
   return _mm256_cvtepi8_epi16(a);
 }
 
 __m256i test_mm256_cvtepi8_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi8_epi32
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
   return _mm256_cvtepi8_epi32(a);
 }
 
 __m256i test_mm256_cvtepi8_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi8_epi64
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
   return _mm256_cvtepi8_epi64(a);
 }
 
 __m256i test_mm256_cvtepi16_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi16_epi32
-  // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
   return _mm256_cvtepi16_epi32(a);
 }
 
 __m256i test_mm256_cvtepi16_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi16_epi64
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
   return _mm256_cvtepi16_epi64(a);
 }
 
 __m256i test_mm256_cvtepi32_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepi32_epi64
-  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
   return _mm256_cvtepi32_epi64(a);
 }
 
 __m256i test_mm256_cvtepu8_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu8_epi16
-  // CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
   return _mm256_cvtepu8_epi16(a);
 }
 
 __m256i test_mm256_cvtepu8_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu8_epi32
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
   return _mm256_cvtepu8_epi32(a);
 }
 
 __m256i test_mm256_cvtepu8_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu8_epi64
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
   return _mm256_cvtepu8_epi64(a);
 }
 
 __m256i test_mm256_cvtepu16_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu16_epi32
-  // CHECK: zext <8 x i16> {{.*}} to <8 x i32>
   return _mm256_cvtepu16_epi32(a);
 }
 
 __m256i test_mm256_cvtepu16_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu16_epi64
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
   return _mm256_cvtepu16_epi64(a);
 }
 
 __m256i test_mm256_cvtepu32_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_cvtepu32_epi64
-  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
   return _mm256_cvtepu32_epi64(a);
 }
 
 __m128i test0_mm256_extracti128_si256_0(__m256i a) {
-  // CHECK-LABEL: test0_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
   return _mm256_extracti128_si256(a, 0);
 }
 
 __m128i test1_mm256_extracti128_si256_1(__m256i a) {
-  // CHECK-LABEL: test1_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
   return _mm256_extracti128_si256(a, 1);
 }
 
 // Immediate should be truncated to one bit.
 __m128i test2_mm256_extracti128_si256(__m256i a) {
-  // CHECK-LABEL: test2_mm256_extracti128_si256
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
   return _mm256_extracti128_si256(a, 0);
 }
 
 __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hadd_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hadd_epi16(a, b);
 }
 
 __m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hadd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hadd_epi32(a, b);
 }
 
 __m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hadds_epi16
-  // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hadds_epi16(a, b);
 }
 
 __m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hsub_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsub_epi16(a, b);
 }
 
 __m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hsub_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_hsub_epi32(a, b);
 }
 
 __m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_hsubs_epi16
-  // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_hsubs_epi16(a, b);
 }
 
 __m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
-  // CHECK-LABEL: test_mm_i32gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm_i32gather_epi32(b, c, 2);
 }
 
 __m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
-  // CHECK-LABEL: test_mm_mask_i32gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm_mask_i32gather_epi32(a, b, c, d, 2);
 }
 
 __m256i test_mm256_i32gather_epi32(int const *b, __m256i c) {
-  // CHECK-LABEL: test_mm256_i32gather_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
   return _mm256_i32gather_epi32(b, c, 2);
 }
 
 __m256i test_mm256_mask_i32gather_epi32(__m256i a, int const *b, __m256i c, __m256i d) {
-  // CHECK-LABEL: test_mm256_mask_i32gather_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
   return _mm256_mask_i32gather_epi32(a, b, c, d, 2);
 }
 
 __m128i test_mm_i32gather_epi64(long long const *b, __m128i c) {
-  // CHECK-LABEL: test_mm_i32gather_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> zeroinitializer, ptr %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
   return _mm_i32gather_epi64(b, c, 2);
 }
 
 __m128i test_mm_mask_i32gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
-  // CHECK-LABEL: test_mm_mask_i32gather_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
   return _mm_mask_i32gather_epi64(a, b, c, d, 2);
 }
 
 __m256i test_mm256_i32gather_epi64(long long const *b, __m128i c) {
-  // X64-LABEL: test_mm256_i32gather_epi64
-  // X64: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> zeroinitializer, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i32gather_epi64
-  // X86: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   return _mm256_i32gather_epi64(b, c, 2);
 }
 
 __m256i test_mm256_mask_i32gather_epi64(__m256i a, long long const *b, __m128i c, __m256i d) {
-  // CHECK-LABEL: test_mm256_mask_i32gather_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   return _mm256_mask_i32gather_epi64(a, b, c, d, 2);
 }
 
 __m128d test_mm_i32gather_pd(double const *b, __m128i c) {
-  // X64-LABEL: test_mm_i32gather_pd
-  // X64:         [[CMP:%.*]] = fcmp oeq <2 x double>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-  // X64: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> zeroinitializer, ptr %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm_i32gather_pd
-  // X86:         [[CMP:%.*]] = fcmp oeq <2 x double>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-  // X86: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   return _mm_i32gather_pd(b, c, 2);
 }
 
 __m128d test_mm_mask_i32gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
-  // CHECK-LABEL: test_mm_mask_i32gather_pd
-  // CHECK: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   return _mm_mask_i32gather_pd(a, b, c, d, 2);
 }
 
 __m256d test_mm256_i32gather_pd(double const *b, __m128i c) {
-  // X64-LABEL: test_mm256_i32gather_pd
-  // X64:         [[CMP:%.*]] = fcmp oeq <4 x double>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
-  // X64: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> zeroinitializer, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i32gather_pd
-  // X86:         [[CMP:%.*]] = fcmp oeq <4 x double>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
-  // X86: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   return _mm256_i32gather_pd(b, c, 2);
 }
 
 __m256d test_mm256_mask_i32gather_pd(__m256d a, double const *b, __m128i c, __m256d d) {
-  // CHECK-LABEL: test_mm256_mask_i32gather_pd
-  // CHECK: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   return _mm256_mask_i32gather_pd(a, b, c, d, 2);
 }
 
 __m128 test_mm_i32gather_ps(float const *b, __m128i c) {
-  // X64-LABEL: test_mm_i32gather_ps
-  // X64:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X64: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> zeroinitializer, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm_i32gather_ps
-  // X86:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X86: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm_i32gather_ps(b, c, 2);
 }
 
 __m128 test_mm_mask_i32gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
-  // CHECK-LABEL: test_mm_mask_i32gather_ps
-  // CHECK: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm_mask_i32gather_ps(a, b, c, d, 2);
 }
 
 __m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
-  // X64-LABEL: test_mm256_i32gather_ps
-  // X64:         [[CMP:%.*]] = fcmp oeq <8 x float>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
-  // X64: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> zeroinitializer, ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i32gather_ps
-  // X86:         [[CMP:%.*]] = fcmp oeq <8 x float>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
-  // X86: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %{{.*}}, ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
   return _mm256_i32gather_ps(b, c, 2);
 }
 
 __m256 test_mm256_mask_i32gather_ps(__m256 a, float const *b, __m256i c, __m256 d) {
-  // CHECK-LABEL: test_mm256_mask_i32gather_ps
-  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %{{.*}}, ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
   return _mm256_mask_i32gather_ps(a, b, c, d, 2);
 }
 
 __m128i test_mm_i64gather_epi32(int const *b, __m128i c) {
-  // CHECK-LABEL: test_mm_i64gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm_i64gather_epi32(b, c, 2);
 }
 
 __m128i test_mm_mask_i64gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
-  // CHECK-LABEL: test_mm_mask_i64gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm_mask_i64gather_epi32(a, b, c, d, 2);
 }
 
 __m128i test_mm256_i64gather_epi32(int const *b, __m256i c) {
-  // CHECK-LABEL: test_mm256_i64gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm256_i64gather_epi32(b, c, 2);
 }
 
 __m128i test_mm256_mask_i64gather_epi32(__m128i a, int const *b, __m256i c, __m128i d) {
-  // CHECK-LABEL: test_mm256_mask_i64gather_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
   return _mm256_mask_i64gather_epi32(a, b, c, d, 2);
 }
 
 __m128i test_mm_i64gather_epi64(long long const *b, __m128i c) {
-  // CHECK-LABEL: test_mm_i64gather_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> zeroinitializer, ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
   return _mm_i64gather_epi64(b, c, 2);
 }
 
 __m128i test_mm_mask_i64gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
-  // CHECK-LABEL: test_mm_mask_i64gather_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
   return _mm_mask_i64gather_epi64(a, b, c, d, 2);
 }
 
 __m256i test_mm256_i64gather_epi64(long long const *b, __m256i c) {
-  // X64-LABEL: test_mm256_i64gather_epi64
-  // X64: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> zeroinitializer, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i64gather_epi64
-  // X86: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   return _mm256_i64gather_epi64(b, c, 2);
 }
 
 __m256i test_mm256_mask_i64gather_epi64(__m256i a, long long const *b, __m256i c, __m256i d) {
-  // CHECK-LABEL: test_mm256_mask_i64gather_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
   return _mm256_mask_i64gather_epi64(a, b, c, d, 2);
 }
 
 __m128d test_mm_i64gather_pd(double const *b, __m128i c) {
-  // X64-LABEL: test_mm_i64gather_pd
-  // X64:         [[CMP:%.*]] = fcmp oeq <2 x double>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-  // X64: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> zeroinitializer, ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm_i64gather_pd
-  // X86:         [[CMP:%.*]] = fcmp oeq <2 x double>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
-  // X86: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   return _mm_i64gather_pd(b, c, 2);
 }
 
 __m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
-  // CHECK-LABEL: test_mm_mask_i64gather_pd
-  // CHECK: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   return _mm_mask_i64gather_pd(a, b, c, d, 2);
 }
 
 __m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
-  // X64-LABEL: test_mm256_i64gather_pd
-  // X64: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
-  // X64: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> zeroinitializer, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i64gather_pd
-  // X86: fcmp oeq <4 x double> %{{.*}}, %{{.*}}
-  // X86: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   return _mm256_i64gather_pd(b, c, 2);
 }
 
 __m256d test_mm256_mask_i64gather_pd(__m256d a, double const *b, __m256i c, __m256d d) {
-  // CHECK-LABEL: test_mm256_mask_i64gather_pd
-  // CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
   return _mm256_mask_i64gather_pd(a, b, c, d, 2);
 }
 
 __m128 test_mm_i64gather_ps(float const *b, __m128i c) {
-  // X64-LABEL: test_mm_i64gather_ps
-  // X64:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X64: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> zeroinitializer, ptr %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm_i64gather_ps
-  // X86:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X86: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm_i64gather_ps(b, c, 2);
 }
 
 __m128 test_mm_mask_i64gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
-  // CHECK-LABEL: test_mm_mask_i64gather_ps
-  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %{{.*}}, ptr %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm_mask_i64gather_ps(a, b, c, d, 2);
 }
 
 __m128 test_mm256_i64gather_ps(float const *b, __m256i c) {
-  // X64-LABEL: test_mm256_i64gather_ps
-  // X64:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X64-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X64-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X64: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> zeroinitializer, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   //
-  // X86-LABEL: test_mm256_i64gather_ps
-  // X86:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // X86-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // X86-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // X86: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm256_i64gather_ps(b, c, 2);
 }
 
 __m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c, __m128 d) {
-  // CHECK-LABEL: test_mm256_mask_i64gather_ps
-  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %{{.*}}, ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm256_mask_i64gather_ps(a, b, c, d, 2);
 }
 
 __m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
-  // CHECK-LABEL: test0_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_inserti128_si256(a, b, 0);
 }
 
 __m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
-  // CHECK-LABEL: test1_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm256_inserti128_si256(a, b, 1);
 }
 
 // Immediate should be truncated to one bit.
 __m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
-  // CHECK-LABEL: test2_mm256_inserti128_si256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm256_inserti128_si256(a, b, 0);
 }
 
 __m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_madd_epi16
-  // CHECK: call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_madd_epi16(a, b);
 }
 
 __m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_maddubs_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_maddubs_epi16(a, b);
 }
 
 __m128i test_mm_maskload_epi32(int const *a, __m128i m) {
-  // CHECK-LABEL: test_mm_maskload_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maskload_epi32(a, m);
 }
 
 __m256i test_mm256_maskload_epi32(int const *a, __m256i m) {
-  // CHECK-LABEL: test_mm256_maskload_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_maskload_epi32(a, m);
 }
 
 __m128i test_mm_maskload_epi64(long long const *a, __m128i m) {
-  // CHECK-LABEL: test_mm_maskload_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maskload_epi64(a, m);
 }
 
 __m256i test_mm256_maskload_epi64(long long const *a, __m256i m) {
-  // CHECK-LABEL: test_mm256_maskload_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_maskload_epi64(a, m);
 }
 
 void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
-  // CHECK-LABEL: test_mm_maskstore_epi32
-  // CHECK: call void @llvm.x86.avx2.maskstore.d(ptr %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   _mm_maskstore_epi32(a, m, b);
 }
 
 void test_mm256_maskstore_epi32(int *a, __m256i m, __m256i b) {
-  // CHECK-LABEL: test_mm256_maskstore_epi32
-  // CHECK: call void @llvm.x86.avx2.maskstore.d.256(ptr %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   _mm256_maskstore_epi32(a, m, b);
 }
 
 void test_mm_maskstore_epi64(long long *a, __m128i m, __m128i b) {
-  // CHECK-LABEL: test_mm_maskstore_epi64
-  // CHECK: call void @llvm.x86.avx2.maskstore.q(ptr %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   _mm_maskstore_epi64(a, m, b);
 }
 
 void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
-  // CHECK-LABEL: test_mm256_maskstore_epi64
-  // CHECK: call void @llvm.x86.avx2.maskstore.q.256(ptr %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   _mm256_maskstore_epi64(a, m, b);
 }
 
 __m256i test_mm256_max_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epi8
-  // CHECK: call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_max_epi8(a, b);
 }
 
 __m256i test_mm256_max_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epi16
-  // CHECK: call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_max_epi16(a, b);
 }
 
 __m256i test_mm256_max_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epi32
-  // CHECK: call <8 x i32> @llvm.smax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_max_epi32(a, b);
 }
 
 __m256i test_mm256_max_epu8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epu8
-  // CHECK: call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_max_epu8(a, b);
 }
 
 __m256i test_mm256_max_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epu16
-  // CHECK: call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_max_epu16(a, b);
 }
 
 __m256i test_mm256_max_epu32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_max_epu32
-  // CHECK: call <8 x i32> @llvm.umax.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_max_epu32(a, b);
 }
 
 __m256i test_mm256_min_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epi8
-  // CHECK: call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_min_epi8(a, b);
 }
 
 __m256i test_mm256_min_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epi16
-  // CHECK: call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_min_epi16(a, b);
 }
 
 __m256i test_mm256_min_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epi32
-  // CHECK: call <8 x i32> @llvm.smin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_min_epi32(a, b);
 }
 
 __m256i test_mm256_min_epu8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epu8
-  // CHECK: call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_min_epu8(a, b);
 }
 
 __m256i test_mm256_min_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epu16
-  // CHECK: call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_min_epu16(a, b);
 }
 
 __m256i test_mm256_min_epu32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_min_epu32
-  // CHECK: call <8 x i32> @llvm.umin.v8i32(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_min_epu32(a, b);
 }
 
 int test_mm256_movemask_epi8(__m256i a) {
-  // CHECK-LABEL: test_mm256_movemask_epi8
-  // CHECK: call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
   return _mm256_movemask_epi8(a);
 }
 
 __m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
-  // CHECK-LABEL: test_mm256_mpsadbw_epu8
-  // CHECK: call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, i8 3)
   return _mm256_mpsadbw_epu8(x, y, 3);
 }
 
 __m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mul_epi32
-  // CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
-  // CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
-  // CHECK: shl <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
-  // CHECK: ashr <4 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32>
-  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epi32(a, b);
 }
 
 __m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mul_epu32
-  // CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  // CHECK: and <4 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  // CHECK: mul <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mul_epu32(a, b);
 }
 
 __m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mulhi_epu16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_mulhi_epu16(a, b);
 }
 
 __m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mulhi_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_mulhi_epi16(a, b);
 }
 
 __m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mulhrs_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_mulhrs_epi16(a, b);
 }
 
 __m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mullo_epi16
-  // CHECK: mul <16 x i16>
   return _mm256_mullo_epi16(a, b);
 }
 
 __m256i test_mm256_mullo_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_mullo_epi32
-  // CHECK: mul <8 x i32>
   return _mm256_mullo_epi32(a, b);
 }
 
 __m256i test_mm256_or_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_or_si256
-  // CHECK: or <4 x i64>
   return _mm256_or_si256(a, b);
 }
 
 __m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_packs_epi16
-  // CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_packs_epi16(a, b);
 }
 
 __m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_packs_epi32
-  // CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_packs_epi32(a, b);
 }
 
 __m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_packs_epu16
-  // CHECK:  call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_packus_epi16(a, b);
 }
 
 __m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_packs_epu32
-  // CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_packus_epi32(a, b);
 }
 
 __m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_permute2x128_si256
-  // CHECK: shufflevector <4 x i64> zeroinitializer, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
   return _mm256_permute2x128_si256(a, b, 0x38);
 }
 
 __m256i test_mm256_permute4x64_epi64(__m256i a) {
-  // CHECK-LABEL: test_mm256_permute4x64_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
   return _mm256_permute4x64_epi64(a, 35);
 }
 
 __m256d test_mm256_permute4x64_pd(__m256d a) {
-  // CHECK-LABEL: test_mm256_permute4x64_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
   return _mm256_permute4x64_pd(a, 25);
 }
 
 __m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_permutevar8x32_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_permutevar8x32_epi32(a, b);
 }
 
 __m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) {
-  // CHECK-LABEL: test_mm256_permutevar8x32_ps
-  // CHECK: call <8 x float> @llvm.x86.avx2.permps(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_permutevar8x32_ps(a, b);
 }
 
 __m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
-  // CHECK-LABEL: test_mm256_sad_epu8
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_sad_epu8(x, y);
 }
 
 __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_shuffle_epi8
-  // CHECK: call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_shuffle_epi8(a, b);
 }
 
 __m256i test_mm256_shuffle_epi32(__m256i a) {
-  // CHECK-LABEL: test_mm256_shuffle_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
   return _mm256_shuffle_epi32(a, 15);
 }
 
 __m256i test_mm256_shufflehi_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_shufflehi_epi16
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
   return _mm256_shufflehi_epi16(a, 107);
 }
 
 __m256i test_mm256_shufflelo_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_shufflelo_epi16
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
   return _mm256_shufflelo_epi16(a, 83);
 }
 
 __m256i test_mm256_sign_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sign_epi8
-  // CHECK: call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_sign_epi8(a, b);
 }
 
 __m256i test_mm256_sign_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sign_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_sign_epi16(a, b);
 }
 
 __m256i test_mm256_sign_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sign_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_sign_epi32(a, b);
 }
 
 __m256i test_mm256_slli_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_slli_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi16(a, 3);
 }
 
 __m256i test_mm256_slli_epi16_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_slli_epi16_2
-  // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi16(a, b);
 }
 
 __m256i test_mm256_slli_epi32(__m256i a) {
-  // CHECK-LABEL: test_mm256_slli_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi32(a, 3);
 }
 
 __m256i test_mm256_slli_epi32_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_slli_epi32_2
-  // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi32(a, b);
 }
 
 __m256i test_mm256_slli_epi64(__m256i a) {
-  // CHECK-LABEL: test_mm256_slli_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi64(a, 3);
 }
 
 __m256i test_mm256_slli_epi64_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_slli_epi64_2
-  // CHECK: call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
   return _mm256_slli_epi64(a, b);
 }
 
 __m256i test_mm256_slli_si256(__m256i a) {
-  // CHECK-LABEL: test_mm256_slli_si256
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
   return _mm256_slli_si256(a, 3);
 }
 
 __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_sllv_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sllv_epi32(a, b);
 }
 
 __m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sllv_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_sllv_epi32(a, b);
 }
 
 __m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_sllv_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sllv_epi64(a, b);
 }
 
 __m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sllv_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_sllv_epi64(a, b);
 }
 
 __m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_sra_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm256_sra_epi16(a, b);
 }
 
 __m256i test_mm256_sra_epi32(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_sra_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm256_sra_epi32(a, b);
 }
 
 __m256i test_mm256_srai_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_srai_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_srai_epi16(a, 3);
 }
 
 __m256i test_mm256_srai_epi16_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_srai_epi16_2
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_srai_epi16(a, b);
 }
 
 __m256i test_mm256_srai_epi32(__m256i a) {
-  // CHECK-LABEL: test_mm256_srai_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_srai_epi32(a, 3);
 }
 
 __m256i test_mm256_srai_epi32_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_srai_epi32_2
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_srai_epi32(a, b);
 }
 
 __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_srav_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srav_epi32(a, b);
 }
 
 __m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_srav_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_srav_epi32(a, b);
 }
 
 __m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_srl_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm256_srl_epi16(a, b);
 }
 
 __m256i test_mm256_srl_epi32(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_srl_epi32
-  // CHECK:call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm256_srl_epi32(a, b);
 }
 
 __m256i test_mm256_srl_epi64(__m256i a, __m128i b) {
-  // CHECK-LABEL: test_mm256_srl_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm256_srl_epi64(a, b);
 }
 
 __m256i test_mm256_srli_epi16(__m256i a) {
-  // CHECK-LABEL: test_mm256_srli_epi16
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi16(a, 3);
 }
 
 __m256i test_mm256_srli_epi16_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_srli_epi16_2
-  // CHECK: call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi16(a, b);
 }
 
 __m256i test_mm256_srli_epi32(__m256i a) {
-  // CHECK-LABEL: test_mm256_srli_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi32(a, 3);
 }
 
 __m256i test_mm256_srli_epi32_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_srli_epi32_2
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi32(a, b);
 }
 
 __m256i test_mm256_srli_epi64(__m256i a) {
-  // CHECK-LABEL: test_mm256_srli_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi64(a, 3);
 }
 
 __m256i test_mm256_srli_epi64_2(__m256i a, int b) {
-  // CHECK-LABEL: test_mm256_srli_epi64_2
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
   return _mm256_srli_epi64(a, b);
 }
 
 __m256i test_mm256_srli_si256(__m256i a) {
-  // CHECK-LABEL: test_mm256_srli_si256
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
   return _mm256_srli_si256(a, 3);
 }
 
 __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_srlv_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srlv_epi32(a, b);
 }
 
 __m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_srlv_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
   return _mm256_srlv_epi32(a, b);
 }
 
 __m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_srlv_epi64
-  // CHECK: call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srlv_epi64(a, b);
 }
 
 __m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_srlv_epi64
-  // CHECK: call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_srlv_epi64(a, b);
 }
 
 __m256i test_mm256_stream_load_si256(__m256i const *a) {
-  // CHECK-LABEL: test_mm256_stream_load_si256
-  // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
   return _mm256_stream_load_si256(a);
 }
 
 __m256i test_mm256_stream_load_si256_void(const void *a) {
-  // CHECK-LABEL: test_mm256_stream_load_si256_void
-  // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
   return _mm256_stream_load_si256(a);
 }
 
 __m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sub_epi8
-  // CHECK: sub <32 x i8>
   return _mm256_sub_epi8(a, b);
 }
 
 __m256i test_mm256_sub_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sub_epi16
-  // CHECK: sub <16 x i16>
   return _mm256_sub_epi16(a, b);
 }
 
 __m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sub_epi32
-  // CHECK: sub <8 x i32>
   return _mm256_sub_epi32(a, b);
 }
 
 __m256i test_mm256_sub_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_sub_epi64
-  // CHECK: sub <4 x i64>
   return _mm256_sub_epi64(a, b);
 }
 
 __m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_subs_epi8
-  // CHECK: call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_subs_epi8(a, b);
 }
 
 __m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_subs_epi16
-  // CHECK: call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_subs_epi16(a, b);
 }
 
 __m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_subs_epu8
-  // CHECK-NOT: call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
-  // CHECK: call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_subs_epu8(a, b);
 }
 
 __m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_subs_epu16
-  // CHECK-NOT: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
-  // CHECK: call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_subs_epu16(a, b);
 }
 
 __m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpackhi_epi8
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   return _mm256_unpackhi_epi8(a, b);
 }
 
 __m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpackhi_epi16
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   return _mm256_unpackhi_epi16(a, b);
 }
 
 __m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpackhi_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   return _mm256_unpackhi_epi32(a, b);
 }
 
 __m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpackhi_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   return _mm256_unpackhi_epi64(a, b);
 }
 
 __m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpacklo_epi8
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
   return _mm256_unpacklo_epi8(a, b);
 }
 
 __m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpacklo_epi16
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
   return _mm256_unpacklo_epi16(a, b);
 }
 
 __m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpacklo_epi32
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   return _mm256_unpacklo_epi32(a, b);
 }
 
 __m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_unpacklo_epi64
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   return _mm256_unpacklo_epi64(a, b);
 }
 
 __m256i test_mm256_xor_si256(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_xor_si256
-  // CHECK: xor <4 x i64>
   return _mm256_xor_si256(a, b);
 }
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
+// X64: {{.*}}
+// X86: {{.*}}
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index 0e3463849951ed..b504cd2a2511a1 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -1,10126 +1,7198 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
 // RUN: %clang_cc1 -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +avx512f -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s
 
 #include <immintrin.h>
 
+//
 __m512d test_mm512_sqrt_pd(__m512d a)
 {
-  // CHECK-LABEL: @test_mm512_sqrt_pd
-  // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
   return _mm512_sqrt_pd(a);
 }
 
+//
 __m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_pd 
-  // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sqrt_pd (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_pd 
-  // CHECK: call <8 x double> @llvm.sqrt.v8f64(<8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_sqrt_pd (__U,__A);
 }
 
+//
 __m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_round_pd
-  // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd
-  // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_sqrt_round_pd(__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_sqrt_round_pd
-  // CHECK: call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %{{.*}}, i32 11)
   return _mm512_sqrt_round_pd(__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_sqrt_ps(__m512 a)
 {
-  // CHECK-LABEL: @test_mm512_sqrt_ps
-  // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
   return _mm512_sqrt_ps(a);
 }
 
+//
 __m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_ps
-  // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sqrt_ps( __W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_ps
-  // CHECK: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_sqrt_ps(__U ,__A);
 }
 
+//
 __m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_round_ps
-  // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps
-  // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_sqrt_round_ps(__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_sqrt_round_ps
-  // CHECK: call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %{{.*}}, i32 11)
   return _mm512_sqrt_round_ps(__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_rsqrt14_pd(__m512d a)
 {
-  // CHECK-LABEL: @test_mm512_rsqrt14_pd
-  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
   return _mm512_rsqrt14_pd(a);
 }
 
+//
 __m512d test_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_rsqrt14_pd 
-  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
   return _mm512_mask_rsqrt14_pd (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_pd 
-  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
   return _mm512_maskz_rsqrt14_pd (__U,__A);
 }
 
+//
 __m512 test_mm512_rsqrt14_ps(__m512 a)
 {
-  // CHECK-LABEL: @test_mm512_rsqrt14_ps
-  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
   return _mm512_rsqrt14_ps(a);
 }
 
+//
 __m512 test_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_rsqrt14_ps 
-  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
   return _mm512_mask_rsqrt14_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_ps 
-  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
   return _mm512_maskz_rsqrt14_ps (__U,__A);
 }
 
+//
 __m512 test_mm512_add_ps(__m512 a, __m512 b)
 {
-  // CHECK-LABEL: @test_mm512_add_ps
-  // CHECK: fadd <16 x float>
   return _mm512_add_ps(a, b);
 }
 
+//
 __m512d test_mm512_add_pd(__m512d a, __m512d b)
 {
-  // CHECK-LABEL: @test_mm512_add_pd
-  // CHECK: fadd <8 x double>
   return _mm512_add_pd(a, b);
 }
 
+//
 __m512 test_mm512_mul_ps(__m512 a, __m512 b)
 {
-  // CHECK-LABEL: @test_mm512_mul_ps
-  // CHECK: fmul <16 x float>
   return _mm512_mul_ps(a, b);
 }
 
+//
 __m512d test_mm512_mul_pd(__m512d a, __m512d b)
 {
-  // CHECK-LABEL: @test_mm512_mul_pd
-  // CHECK: fmul <8 x double>
   return _mm512_mul_pd(a, b);
 }
 
+//
 void test_mm512_storeu_si512 (void *__P, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_storeu_si512
-  // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm512_storeu_si512 ( __P,__A);
 }
 
+//
 void test_mm512_storeu_ps(void *p, __m512 a)
 {
-  // CHECK-LABEL: @test_mm512_storeu_ps
-  // CHECK: store <16 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm512_storeu_ps(p, a);
 }
 
+//
 void test_mm512_storeu_pd(void *p, __m512d a)
 {
-  // CHECK-LABEL: @test_mm512_storeu_pd
-  // CHECK: store <8 x double> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm512_storeu_pd(p, a);
 }
 
+//
 void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m)
 {
-  // CHECK-LABEL: @test_mm512_mask_store_ps
-  // CHECK: @llvm.masked.store.v16f32.p0(<16 x float> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}})
   _mm512_mask_store_ps(p, m, a);
 }
 
+//
 void test_mm512_store_si512 (void *__P, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_store_si512 
-  // CHECK: load <8 x i64>, ptr %__A.addr.i, align 64{{$}}
-  // CHECK: [[SI512_3:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: store <8 x i64>  
   _mm512_store_si512 ( __P,__A);
 }
 
+//
 void test_mm512_store_epi32 (void *__P, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_store_epi32 
-  // CHECK: load <8 x i64>, ptr %__A.addr.i, align 64{{$}}
-  // CHECK: [[Si32_3:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: store <8 x i64>  
   _mm512_store_epi32 ( __P,__A);
 }
 
+//
 void test_mm512_store_epi64 (void *__P, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_store_epi64 
-  // CHECK: load <8 x i64>, ptr %__A.addr.i, align 64{{$}}
-  // CHECK: [[SI64_3:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: store <8 x i64>  
   _mm512_store_epi64 ( __P,__A);
 }
 
+//
 void test_mm512_store_ps(void *p, __m512 a)
 {
-  // CHECK-LABEL: @test_mm512_store_ps
-  // CHECK: store <16 x float>
   _mm512_store_ps(p, a);
 }
 
+//
 void test_mm512_store_pd(void *p, __m512d a)
 {
-  // CHECK-LABEL: @test_mm512_store_pd
-  // CHECK: store <8 x double>
   _mm512_store_pd(p, a);
 }
 
+//
 void test_mm512_mask_store_pd(void *p, __m512d a, __mmask8 m)
 {
-  // CHECK-LABEL: @test_mm512_mask_store_pd
-  // CHECK: @llvm.masked.store.v8f64.p0(<8 x double> %{{.*}}, ptr %{{.*}}, i32 64, <8 x i1> %{{.*}})
   _mm512_mask_store_pd(p, m, a);
 }
 
+//
 void test_mm512_storeu_epi32(void *__P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_storeu_epi32
-  // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  return _mm512_storeu_epi32(__P, __A); 
+  return _mm512_storeu_epi32(__P, __A);
 }
 
+//
 void test_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_storeu_epi32
-  // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}})
-  return _mm512_mask_storeu_epi32(__P, __U, __A); 
+  return _mm512_mask_storeu_epi32(__P, __U, __A);
 }
 
+//
 void test_mm512_storeu_epi64(void *__P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_storeu_epi64
-  // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  return _mm512_storeu_epi64(__P, __A); 
+  return _mm512_storeu_epi64(__P, __A);
 }
 
+//
 void test_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_storeu_epi64
-  // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
-  return _mm512_mask_storeu_epi64(__P, __U, __A); 
+  return _mm512_mask_storeu_epi64(__P, __U, __A);
 }
 
+//
 __m512i test_mm512_loadu_si512 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_loadu_si512 
-  // CHECK: load <8 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm512_loadu_si512 ( __P);
 }
 
+//
 __m512i test_mm512_loadu_epi32 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_loadu_epi32 
-  // CHECK: load <8 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm512_loadu_epi32 (__P);
 }
 
+//
 __m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_loadu_epi32 
-  // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_mask_loadu_epi32 (__W,__U, __P);
 }
 
+//
 __m512i test_mm512_maskz_loadu_epi32 (__mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_loadu_epi32
-  // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_maskz_loadu_epi32 (__U, __P);
 }
 
+//
 __m512i test_mm512_loadu_epi64 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_loadu_epi64 
-  // CHECK: load <8 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm512_loadu_epi64 (__P);
 }
 
+//
 __m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_loadu_epi64 
-  // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_mask_loadu_epi64 (__W,__U, __P);
 }
 
+//
 __m512i test_mm512_maskz_loadu_epi64 (__mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_loadu_epi64
-  // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_maskz_loadu_epi64 (__U, __P);
 }
 
+//
 __m512 test_mm512_loadu_ps(void *p)
 {
-  // CHECK-LABEL: @test_mm512_loadu_ps
-  // CHECK: load <16 x float>, ptr {{.*}}, align 1{{$}}
   return _mm512_loadu_ps(p);
 }
 
+//
 __m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_loadu_ps 
-  // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_mask_loadu_ps (__W,__U, __P);
 }
 
+//
 __m512d test_mm512_loadu_pd(void *p)
 {
-  // CHECK-LABEL: @test_mm512_loadu_pd
-  // CHECK: load <8 x double>, ptr {{.*}}, align 1{{$}}
   return _mm512_loadu_pd(p);
 }
 
+//
 __m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_loadu_pd 
-  // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_mask_loadu_pd (__W,__U, __P);
 }
 
+//
 __m512i test_mm512_load_si512 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_load_si512 
-  // CHECK: [[LI512_1:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: load <8 x i64>, ptr [[LI512_1]], align 64{{$}}
   return _mm512_load_si512 ( __P);
 }
 
+//
 __m512i test_mm512_load_epi32 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_load_epi32 
-  // CHECK: [[LI32_1:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: load <8 x i64>, ptr [[LI32_1]], align 64{{$}}
   return _mm512_load_epi32 ( __P);
 }
 
+//
 __m512i test_mm512_load_epi64 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_load_epi64 
-  // CHECK: [[LI64_1:%.+]] = load ptr, ptr %__P.addr.i, align 8{{$}}
-  // CHECK: load <8 x i64>, ptr [[LI64_1]], align 64{{$}}
   return _mm512_load_epi64 ( __P);
 }
 
+//
 __m512 test_mm512_load_ps(void *p)
 {
-  // CHECK-LABEL: @test_mm512_load_ps
-  // CHECK: load <16 x float>, ptr %{{.*}}, align 64{{$}}
   return _mm512_load_ps(p);
 }
 
+//
 __m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_load_ps 
-  // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_mask_load_ps (__W,__U, __P);
 }
 
+//
 __m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_ps
-  // CHECK: @llvm.masked.load.v16f32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_maskz_load_ps(__U, __P);
 }
 
+//
 __m512d test_mm512_load_pd(void *p)
 {
-  // CHECK-LABEL: @test_mm512_load_pd
-  // CHECK: load <8 x double>, ptr %{{.*}}, align 64{{$}}
   return _mm512_load_pd(p);
 }
 
+//
 __m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_mask_load_pd 
-  // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_mask_load_pd (__W,__U, __P);
 }
 
+//
 __m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_pd
-  // CHECK: @llvm.masked.load.v8f64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_maskz_load_pd(__U, __P);
 }
 
+//
 __m512d test_mm512_set1_pd(double d)
 {
-  // CHECK-LABEL: @test_mm512_set1_pd
-  // CHECK: insertelement <8 x double> {{.*}}, i32 0
-  // CHECK: insertelement <8 x double> {{.*}}, i32 1
-  // CHECK: insertelement <8 x double> {{.*}}, i32 2
-  // CHECK: insertelement <8 x double> {{.*}}, i32 3
-  // CHECK: insertelement <8 x double> {{.*}}, i32 4
-  // CHECK: insertelement <8 x double> {{.*}}, i32 5
-  // CHECK: insertelement <8 x double> {{.*}}, i32 6
-  // CHECK: insertelement <8 x double> {{.*}}, i32 7
   return _mm512_set1_pd(d);
 }
 
+//
 __mmask16 test_mm512_knot(__mmask16 a)
 {
-  // CHECK-LABEL: @test_mm512_knot
-  // CHECK: [[IN:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[IN]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: bitcast <16 x i1> [[NOT]] to i16
   return _mm512_knot(a);
 }
 
+//
 __m512i test_mm512_alignr_epi32(__m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_alignr_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   return _mm512_alignr_epi32(a, b, 2);
 }
 
+//
 __m512i test_mm512_mask_alignr_epi32(__m512i w, __mmask16 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_mask_alignr_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}}
   return _mm512_mask_alignr_epi32(w, u, a, b, 2);
 }
 
+//
 __m512i test_mm512_maskz_alignr_epi32( __mmask16 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_maskz_alignr_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> {{.*}}
   return _mm512_maskz_alignr_epi32(u, a, b, 2);
 }
 
+//
 __m512i test_mm512_alignr_epi64(__m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_alignr_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
   return _mm512_alignr_epi64(a, b, 2);
 }
 
+//
 __m512i test_mm512_mask_alignr_epi64(__m512i w, __mmask8 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_mask_alignr_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}}
   return _mm512_mask_alignr_epi64(w, u, a, b, 2);
 }
 
+//
 __m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_maskz_alignr_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> {{.*}}
   return _mm512_maskz_alignr_epi64(u, a, b, 2);
 }
 
+//
 __m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_round_pd
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   return _mm512_fmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_fmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_round_pd
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pd
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pd
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   return _mm512_fmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_fmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   return _mm512_fnmadd_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fnmadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fnmadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fnmadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
   return _mm512_fnmsub_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fnmsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_pd
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_fmadd_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_pd
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmadd_pd(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_pd
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmadd_pd(__A, __B, __C, __U);
 }
+//
 __m512d test_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_pd
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmadd_pd(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_fmsub_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmsub_pd(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsub_pd(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_fnmadd_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fnmadd_pd(__A, __B, __C, __U);
 }
+//
 __m512d test_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmadd_pd(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
   return _mm512_fnmsub_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fnmsub_pd(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_round_ps
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   return _mm512_fmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_round_ps
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ps
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ps
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   return _mm512_fmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   return _mm512_fnmadd_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fnmadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fnmadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fnmadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
   return _mm512_fnmsub_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fnmsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_ps
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_fmadd_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_ps
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_mask_fmadd_ps(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_ps
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmadd_ps(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_ps
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmadd_ps(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_fmsub_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmsub_ps(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsub_ps(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_fnmadd_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fnmadd_ps(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmadd_ps(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
   return _mm512_fnmsub_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fnmsub_ps(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_round_pd
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
   return _mm512_fmaddsub_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_fmaddsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_pd
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmaddsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fmaddsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_pd
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmaddsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fmaddsub_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_pd
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmaddsub_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_round_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
   return _mm512_fmsubadd_round_pd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_fmsubadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmsubadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_fmsubadd_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsubadd_round_pd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_pd
-  // CHECK-NOT: fneg
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
   return _mm512_fmaddsub_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_pd
-  // CHECK-NOT: fneg
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmaddsub_pd(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_pd
-  // CHECK-NOT: fneg
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmaddsub_pd(__A, __B, __C, __U);
 }
+//
 __m512d test_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_pd
-  // CHECK-NOT: fneg
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}, i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmaddsub_pd(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_pd
-  // CHECK: [[NEG:%.+]] = fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]], i32 4)
   return _mm512_fmsubadd_pd(__A, __B, __C);
 }
+//
 __m512d test_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_pd
-  // CHECK: [[NEG:%.+]] = fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]], i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fmsubadd_pd(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_pd
-  // CHECK: [[NEG:%.+]] = fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]], i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> zeroinitializer
   return _mm512_maskz_fmsubadd_pd(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_round_ps
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
   return _mm512_fmaddsub_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fmaddsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ps
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmaddsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fmaddsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ps
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmaddsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fmaddsub_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ps
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmaddsub_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
   return _mm512_fmsubadd_round_ps(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fmsubadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmsubadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_fmsubadd_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsubadd_round_ps(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_ps
-  // CHECK-NOT: fneg
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
   return _mm512_fmaddsub_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_ps
-  // CHECK-NOT: fneg
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmaddsub_ps(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_ps
-  // CHECK-NOT: fneg
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmaddsub_ps(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_ps
-  // CHECK-NOT: fneg
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}, i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmaddsub_ps(__U, __A, __B, __C);
 }
+//
 __m512 test_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_ps
-  // CHECK: [[NEG:%.+]] = fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]], i32 4)
   return _mm512_fmsubadd_ps(__A, __B, __C);
 }
+//
 __m512 test_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_ps
-  // CHECK: [[NEG:%.+]] = fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]], i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmsubadd_ps(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_ps
-  // CHECK: [[NEG:%.+]] = fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]], i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> zeroinitializer
   return _mm512_maskz_fmsubadd_ps(__U, __A, __B, __C);
 }
+//
 __m512d test_mm512_mask3_fmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_round_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmsub_pd(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_mask3_fmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmsub_ps(__A, __B, __C, __U);
 }
+//
 __m512d test_mm512_mask3_fmsubadd_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmsubadd_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_pd
-  // CHECK: [[NEG:%.+]] = fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> [[NEG]], i32 4)
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fmsubadd_pd(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_mask3_fmsubadd_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmaddsub.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmsubadd_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ps
-  // CHECK: [[NEG:%.+]] = fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> [[NEG]], i32 4)
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmsubadd_ps(__A, __B, __C, __U);
 }
+//
 __m512d test_mm512_mask_fnmadd_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fnmadd_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fnmadd_pd(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_mask_fnmadd_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fnmadd_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fnmadd_ps(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_mask_fnmsub_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fnmsub_round_pd(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask3_fnmsub_round_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_pd
-  // CHECK: fneg <8 x double>
-  // CHECK: fneg <8 x double>
-  // CHECK: @llvm.x86.avx512.vfmadd.pd.512
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fnmsub_round_pd(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_fnmsub_pd(__A, __U, __B, __C);
 }
+//
 __m512d test_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_pd
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: fneg <8 x double> %{{.*}}
-  // CHECK: call <8 x double> @llvm.fma.v8f64(<8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}})
-  // CHECK: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask3_fnmsub_pd(__A, __B, __C, __U);
 }
+//
 __m512 test_mm512_mask_fnmsub_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fnmsub_round_ps(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask3_fnmsub_round_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: @llvm.x86.avx512.vfmadd.ps.512
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fnmsub_round_ps(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fnmsub_ps(__A, __U, __B, __C);
 }
+//
 __m512 test_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_ps
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: fneg <16 x float> %{{.*}}
-  // CHECK: call <16 x float> @llvm.fma.v16f32(<16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}})
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fnmsub_ps(__A, __B, __C, __U);
 }
 
+//
 __mmask16 test_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epi32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epi32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epi64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epi64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epi64_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epi32_mask
-  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epi32_mask
-  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epi64_mask
-  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epi64_mask
-  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epi64_mask(__a, __b);
 }
 
+//
 __m512d test_mm512_unpackhi_pd(__m512d a, __m512d b)
 {
-  // CHECK-LABEL: @test_mm512_unpackhi_pd
-  // CHECK: shufflevector <8 x double> {{.*}} <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   return _mm512_unpackhi_pd(a, b);
 }
 
+//
 __m512d test_mm512_unpacklo_pd(__m512d a, __m512d b)
 {
-  // CHECK-LABEL: @test_mm512_unpacklo_pd
-  // CHECK: shufflevector <8 x double> {{.*}} <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   return _mm512_unpacklo_pd(a, b);
 }
 
+//
 __m512 test_mm512_unpackhi_ps(__m512 a, __m512 b)
 {
-  // CHECK-LABEL: @test_mm512_unpackhi_ps
-  // CHECK: shufflevector <16 x float> {{.*}} <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   return _mm512_unpackhi_ps(a, b);
 }
 
+//
 __m512 test_mm512_unpacklo_ps(__m512 a, __m512 b)
 {
-  // CHECK-LABEL: @test_mm512_unpacklo_ps
-  // CHECK: shufflevector <16 x float> {{.*}} <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   return _mm512_unpacklo_ps(a, b);
 }
 
+//
 __mmask16 test_mm512_cmp_round_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmp_round_ps_mask
-  // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_round_ps_mask(a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_round_ps_mask(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_round_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_round_ps_mask(m, a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_eq_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmp_ps_mask_eq_oq
-  // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_lt_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_lt_os
-  // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_LT_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_le_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_le_os
-  // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_LE_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_unord_q(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_unord_q
-  // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_UNORD_Q);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_neq_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_uq
-  // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nlt_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nlt_us
-  // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NLT_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nle_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nle_us
-  // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NLE_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ord_q(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ord_q
-  // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_ORD_Q);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_eq_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_uq
-  // CHECK: fcmp ueq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_EQ_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nge_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nge_us
-  // CHECK: fcmp ult <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NGE_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ngt_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ngt_us
-  // CHECK: fcmp ule <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NGT_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_false_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_false_oq
-  // CHECK: fcmp false <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_FALSE_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_neq_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_oq
-  // CHECK: fcmp one <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ge_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ge_os
-  // CHECK: fcmp oge <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_GE_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_gt_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_gt_os
-  // CHECK: fcmp ogt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_GT_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_true_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_true_uq
-  // CHECK: fcmp true <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_TRUE_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_eq_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_os
-  // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_EQ_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_lt_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_lt_oq
-  // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_le_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_le_oq
-  // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_unord_s(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_unord_s
-  // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_UNORD_S);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_neq_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_us
-  // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nlt_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nlt_uq
-  // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NLT_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nle_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nle_uq
-  // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NLE_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ord_s(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ord_s
-  // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_ORD_S);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_eq_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_eq_us
-  // CHECK: fcmp ueq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_EQ_US);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_nge_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_nge_uq
-  // CHECK: fcmp ult <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ngt_uq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ngt_uq
-  // CHECK: fcmp ule <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_false_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_false_os
-  // CHECK: fcmp false <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_FALSE_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_neq_os(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_neq_os
-  // CHECK: fcmp one <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_NEQ_OS);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_ge_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_ge_oq
-  // CHECK: fcmp oge <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_gt_oq(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_gt_oq
-  // CHECK: fcmp ogt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ);
 }
 
+//
 __mmask16 test_mm512_cmp_ps_mask_true_us(__m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_cmp_ps_mask_true_us
-  // CHECK: fcmp true <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmp_ps_mask(a, b, _CMP_TRUE_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_eq_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_ps_mask_eq_oq
-  // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_lt_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_lt_os
-  // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_le_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_le_os
-  // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_unord_q(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_unord_q
-  // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_Q);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_neq_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_uq
-  // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nlt_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nlt_us
-  // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nle_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nle_us
-  // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ord_q(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ord_q
-  // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_Q);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_eq_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_uq
-  // CHECK: [[CMP:%.*]] = fcmp ueq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nge_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nge_us
-  // CHECK: [[CMP:%.*]] = fcmp ult <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGE_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ngt_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ngt_us
-  // CHECK: [[CMP:%.*]] = fcmp ule <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGT_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_false_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_false_oq
-  // CHECK: [[CMP:%.*]] = fcmp false <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_neq_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_oq
-  // CHECK: [[CMP:%.*]] = fcmp one <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ge_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ge_os
-  // CHECK: [[CMP:%.*]] = fcmp oge <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GE_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_gt_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_gt_os
-  // CHECK: [[CMP:%.*]] = fcmp ogt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_true_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_true_uq
-  // CHECK: [[CMP:%.*]] = fcmp true <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_eq_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_os
-  // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_lt_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_lt_oq
-  // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LT_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_le_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_le_oq
-  // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_LE_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_unord_s(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_unord_s
-  // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_UNORD_S);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_neq_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_us
-  // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nlt_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nlt_uq
-  // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLT_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nle_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nle_uq
-  // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NLE_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ord_s(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ord_s
-  // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_ORD_S);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_eq_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_eq_us
-  // CHECK: [[CMP:%.*]] = fcmp ueq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_EQ_US);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_nge_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_nge_uq
-  // CHECK: [[CMP:%.*]] = fcmp ult <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGE_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ngt_uq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ngt_uq
-  // CHECK: [[CMP:%.*]] = fcmp ule <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NGT_UQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_false_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_false_os
-  // CHECK: [[CMP:%.*]] = fcmp false <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_FALSE_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_neq_os(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_neq_os
-  // CHECK: [[CMP:%.*]] = fcmp one <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_NEQ_OS);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_ge_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_ge_oq
-  // CHECK: [[CMP:%.*]] = fcmp oge <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GE_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_gt_oq(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_gt_oq
-  // CHECK: [[CMP:%.*]] = fcmp ogt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_GT_OQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_ps_mask_true_us(__mmask16 m, __m512 a, __m512 b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ps_mask_true_us
-  // CHECK: [[CMP:%.*]] = fcmp true <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ps_mask(m, a, b, _CMP_TRUE_US);
 }
 
+//
 __mmask8 test_mm512_cmp_round_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmp_round_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_round_pd_mask(a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_round_pd_mask(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_round_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_round_pd_mask(m, a, b, _CMP_EQ_OQ, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_eq_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmp_pd_mask_eq_oq
-  // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_lt_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_lt_os
-  // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_LT_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_le_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_le_os
-  // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_LE_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_unord_q(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_unord_q
-  // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_UNORD_Q);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_neq_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_uq
-  // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nlt_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nlt_us
-  // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NLT_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nle_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nle_us
-  // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NLE_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ord_q(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ord_q
-  // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_ORD_Q);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_eq_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_uq
-  // CHECK: fcmp ueq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_EQ_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nge_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nge_us
-  // CHECK: fcmp ult <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NGE_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ngt_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ngt_us
-  // CHECK: fcmp ule <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NGT_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_false_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_false_oq
-  // CHECK: fcmp false <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_FALSE_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_neq_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_oq
-  // CHECK: fcmp one <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ge_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ge_os
-  // CHECK: fcmp oge <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_GE_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_gt_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_gt_os
-  // CHECK: fcmp ogt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_GT_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_true_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_true_uq
-  // CHECK: fcmp true <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_TRUE_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_eq_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_os
-  // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_EQ_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_lt_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_lt_oq
-  // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_le_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_le_oq
-  // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_unord_s(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_unord_s
-  // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_UNORD_S);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_neq_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_us
-  // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nlt_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nlt_uq
-  // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NLT_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nle_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nle_uq
-  // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NLE_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ord_s(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ord_s
-  // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_ORD_S);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_eq_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_eq_us
-  // CHECK: fcmp ueq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_EQ_US);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_nge_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_nge_uq
-  // CHECK: fcmp ult <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ngt_uq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ngt_uq
-  // CHECK: fcmp ule <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_false_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_false_os
-  // CHECK: fcmp false <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_FALSE_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_neq_os(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_neq_os
-  // CHECK: fcmp one <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_NEQ_OS);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_ge_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_ge_oq
-  // CHECK: fcmp oge <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_GE_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_gt_oq(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_gt_oq
-  // CHECK: fcmp ogt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_GT_OQ);
 }
 
+//
 __mmask8 test_mm512_cmp_pd_mask_true_us(__m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_cmp_pd_mask_true_us
-  // CHECK: fcmp true <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmp_pd_mask(a, b, _CMP_TRUE_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_eq_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_pd_mask_eq_oq
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_lt_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_lt_os
-  // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_le_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_le_os
-  // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_unord_q(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_unord_q
-  // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_Q);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_neq_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_uq
-  // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nlt_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nlt_us
-  // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nle_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nle_us
-  // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ord_q(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ord_q
-  // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_Q);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_eq_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_uq
-  // CHECK: [[CMP:%.*]] = fcmp ueq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nge_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nge_us
-  // CHECK: [[CMP:%.*]] = fcmp ult <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGE_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ngt_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ngt_us
-  // CHECK: [[CMP:%.*]] = fcmp ule <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGT_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_false_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_false_oq
-  // CHECK: [[CMP:%.*]] = fcmp false <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_neq_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_oq
-  // CHECK: [[CMP:%.*]] = fcmp one <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ge_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ge_os
-  // CHECK: [[CMP:%.*]] = fcmp oge <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GE_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_gt_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_gt_os
-  // CHECK: [[CMP:%.*]] = fcmp ogt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GT_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_true_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_true_uq
-  // CHECK: [[CMP:%.*]] = fcmp true <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_eq_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_os
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_lt_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_lt_oq
-  // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LT_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_le_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_le_oq
-  // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_LE_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_unord_s(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_unord_s
-  // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_UNORD_S);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_neq_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_us
-  // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nlt_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nlt_uq
-  // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLT_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nle_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nle_uq
-  // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NLE_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ord_s(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ord_s
-  // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_ORD_S);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_eq_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_eq_us
-  // CHECK: [[CMP:%.*]] = fcmp ueq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_EQ_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_nge_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_nge_uq
-  // CHECK: [[CMP:%.*]] = fcmp ult <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGE_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ngt_uq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ngt_uq
-  // CHECK: [[CMP:%.*]] = fcmp ule <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NGT_UQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_false_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_false_os
-  // CHECK: [[CMP:%.*]] = fcmp false <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_FALSE_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_neq_os(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_neq_os
-  // CHECK: [[CMP:%.*]] = fcmp one <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_NEQ_OS);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_ge_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_ge_oq
-  // CHECK: [[CMP:%.*]] = fcmp oge <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GE_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_gt_oq(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_gt_oq
-  // CHECK: [[CMP:%.*]] = fcmp ogt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_GT_OQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask_true_us(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_pd_mask_true_us
-  // CHECK: [[CMP:%.*]] = fcmp true <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, _CMP_TRUE_US);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_pd_mask(__mmask8 m, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_pd_mask(m, a, b, 0);
 }
 
+//
 __mmask8 test_mm512_cmpeq_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_pd_mask
-  // CHECK: fcmp oeq <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpeq_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpeq_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_ps_mask
-  // CHECK: fcmp oeq <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpeq_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpeq_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpeq_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpeq_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpeq_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmple_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmple_pd_mask
-  // CHECK: fcmp ole <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmple_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmple_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmple_ps_mask
-  // CHECK: fcmp ole <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmple_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmple_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp ole <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmple_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmple_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp ole <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmple_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmplt_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmplt_pd_mask
-  // CHECK: fcmp olt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmplt_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmplt_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmplt_ps_mask
-  // CHECK: fcmp olt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmplt_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmplt_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp olt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmplt_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmplt_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp olt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmplt_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmpneq_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_pd_mask
-  // CHECK: fcmp une <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpneq_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpneq_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_ps_mask
-  // CHECK: fcmp une <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpneq_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpneq_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp une <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpneq_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpneq_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp une <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpneq_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmpnle_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpnle_pd_mask
-  // CHECK: fcmp ugt <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpnle_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpnle_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpnle_ps_mask
-  // CHECK: fcmp ugt <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpnle_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpnle_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpnle_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp ugt <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpnle_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpnle_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpnle_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp ugt <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpnle_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmpnlt_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpnlt_pd_mask
-  // CHECK: fcmp uge <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpnlt_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpnlt_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpnlt_ps_mask
-  // CHECK: fcmp uge <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpnlt_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpnlt_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpnlt_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp uge <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpnlt_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpnlt_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpnlt_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp uge <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpnlt_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmpord_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpord_pd_mask
-  // CHECK: fcmp ord <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpord_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpord_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpord_ps_mask
-  // CHECK: fcmp ord <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpord_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpord_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpord_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp ord <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpord_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpord_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpord_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp ord <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpord_ps_mask(k, a, b);
 }
 
+//
 __mmask8 test_mm512_cmpunord_pd_mask(__m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_cmpunord_pd_mask
-  // CHECK: fcmp uno <8 x double> %{{.*}}, %{{.*}}
   return _mm512_cmpunord_pd_mask(a, b);
 }
 
+//
 __mmask16 test_mm512_cmpunord_ps_mask(__m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_cmpunord_ps_mask
-  // CHECK: fcmp uno <16 x float> %{{.*}}, %{{.*}}
   return _mm512_cmpunord_ps_mask(a, b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpunord_pd_mask(__mmask8 k, __m512d a, __m512d b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpunord_pd_mask
-  // CHECK: [[CMP:%.*]] = fcmp uno <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpunord_pd_mask(k, a, b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpunord_ps_mask(__mmask16 k, __m512 a, __m512 b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpunord_ps_mask
-  // CHECK: [[CMP:%.*]] = fcmp uno <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmpunord_ps_mask(k, a, b);
 }
 
+//
 __m256d test_mm512_extractf64x4_pd(__m512d a)
 {
-  // CHECK-LABEL: @test_mm512_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm512_extractf64x4_pd(a, 1);
 }
 
+//
 __m256d test_mm512_mask_extractf64x4_pd(__m256d  __W,__mmask8  __U,__m512d __A){
-  // CHECK-LABEL:@test_mm512_mask_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
 }
 
+//
 __m256d test_mm512_maskz_extractf64x4_pd(__mmask8  __U,__m512d __A){
-  // CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
 }
 
+//
 __m128 test_mm512_extractf32x4_ps(__m512 a)
 {
-  // CHECK-LABEL: @test_mm512_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return _mm512_extractf32x4_ps(a, 1);
 }
 
+//
 __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8  __U,__m512 __A){
-  // CHECK-LABEL:@test_mm512_mask_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
 }
 
+//
 __m128 test_mm512_maskz_extractf32x4_ps( __mmask8  __U,__m512 __A){
-  // CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm512_maskz_extractf32x4_ps(__U, __A, 1);
 }
 
+//
 __mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epu32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epu32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpeq_epu64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpeq_epu64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epi32_mask
-  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epi32_mask
-  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epi64_mask
-  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epi64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epi64_mask
-  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epu32_mask
-  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epu32_mask
-  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpge_epu64_mask
-  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpge_epu64_mask
-  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epu32_mask
-  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epu32_mask
-  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpgt_epu64_mask
-  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpgt_epu64_mask
-  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epi32_mask
-  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epi32_mask
-  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epi64_mask
-  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epi64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epi64_mask
-  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epu32_mask
-  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epu32_mask
-  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmple_epu64_mask
-  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmple_epu64_mask
-  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epi32_mask
-  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epi32_mask
-  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epi64_mask
-  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epi64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epi64_mask
-  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epu32_mask
-  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epu32_mask
-  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmplt_epu64_mask
-  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmplt_epu64_mask
-  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epi32_mask
-  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epi32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epi32_mask
-  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epi64_mask
-  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epi64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epi64_mask
-  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epu32_mask
-  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epu32_mask(__a, __b);
 }
 
+//
 __mmask16 test_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epu32_mask
-  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
+//
 __mmask8 test_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmpneq_epu64_mask
-  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epu64_mask(__a, __b);
 }
 
+//
 __mmask8 test_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmpneq_epu64_mask
-  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
+//
 __mmask16 test_mm512_cmp_eq_epi32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_eq_epi32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_eq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
+//
 __mmask8 test_mm512_cmp_eq_epi64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_eq_epi64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, _MM_CMPINT_EQ);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_eq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_eq_epi64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, _MM_CMPINT_EQ);
 }
 
+//
 __mmask16 test_mm512_cmp_epu32_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epu32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, 0);
 }
 
+//
 __mmask16 test_mm512_mask_cmp_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epu32_mask
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
+//
 __mmask8 test_mm512_cmp_epu64_mask(__m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_cmp_epu64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, 0);
 }
 
+//
 __mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_epu64_mask
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
+//
 __m512i test_mm512_mask_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_and_epi32
-  // CHECK: and <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_and_epi32(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_and_epi32
-  // CHECK: and <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_and_epi32(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_mask_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_and_epi64
-  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_and_epi64(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_and_epi64
-  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_and_epi64(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_mask_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_or_epi32
-  // CHECK: or <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_or_epi32(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_or_epi32
-  // CHECK: or <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_or_epi32(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_mask_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_or_epi64
-  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_or_epi64(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_or_epi64
-  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_or_epi64(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_xor_epi32
-  // CHECK: xor <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_xor_epi32(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_xor_epi32
-  // CHECK: xor <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_xor_epi32(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_mask_xor_epi64
-  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_xor_epi64(__src, __k,__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_maskz_xor_epi64
-  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_xor_epi64(__k,__a, __b);
 }
 
+//
 __m512i test_mm512_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_and_epi32
-  // CHECK: and <16 x i32>
   return _mm512_and_epi32(__a, __b);
 }
 
+//
 __m512i test_mm512_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_and_epi64
-  // CHECK: and <8 x i64>
   return _mm512_and_epi64(__a, __b);
 }
 
+//
 __m512i test_mm512_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_or_epi32
-  // CHECK: or <16 x i32>
   return _mm512_or_epi32(__a, __b);
 }
 
+//
 __m512i test_mm512_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_or_epi64
-  // CHECK: or <8 x i64>
   return _mm512_or_epi64(__a, __b);
 }
 
+//
 __m512i test_mm512_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_xor_epi32
-  // CHECK: xor <16 x i32>
   return _mm512_xor_epi32(__a, __b);
 }
 
+//
 __m512i test_mm512_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
-  // CHECK-LABEL: @test_mm512_xor_epi64
-  // CHECK: xor <8 x i64>
   return _mm512_xor_epi64(__a, __b);
 }
 
+//
 __m512i test_mm512_maskz_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B){
-  // CHECK-LABEL: @test_mm512_maskz_andnot_epi32
-  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_andnot_epi32(__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mask_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                       __m512i __src) {
-  // CHECK-LABEL: @test_mm512_mask_andnot_epi32
-  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_andnot_epi32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_andnot_si512(__m512i __A, __m512i __B)
 {
-  //CHECK-LABEL: @test_mm512_andnot_si512
-  //CHECK: load {{.*}}%__A.addr.i, align 64
-  //CHECK: %not.i = xor{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-  //CHECK: load {{.*}}%__B.addr.i, align 64
-  //CHECK: and <8 x i64> %not.i,{{.*}}
 
   return _mm512_andnot_si512(__A, __B);
 }
 
+//
 __m512i test_mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_andnot_epi32
-  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi32(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_andnot_epi64
-  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_andnot_epi64(__k,__A,__B);
 }
 
-__m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B, 
+//
+__m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B,
                                       __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi64
-  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_andnot_epi64(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi64
-  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
-  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi64(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_sub_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_sub_epi32
-  //CHECK: sub <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_sub_epi32(__k,__A,__B);
 }
 
-__m512i test_mm512_mask_sub_epi32 (__mmask16 __k,__m512i __A, __m512i __B, 
+//
+__m512i test_mm512_mask_sub_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_sub_epi32
-  //CHECK: sub <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_sub_epi32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_sub_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_sub_epi32
-  //CHECK: sub <16 x i32>
   return _mm512_sub_epi32(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_sub_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_sub_epi64
-  //CHECK: sub <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_sub_epi64(__k,__A,__B);
 }
 
-__m512i test_mm512_mask_sub_epi64 (__mmask8 __k,__m512i __A, __m512i __B, 
+//
+__m512i test_mm512_mask_sub_epi64 (__mmask8 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_sub_epi64
-  //CHECK: sub <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_sub_epi64(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_sub_epi64(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_sub_epi64
-  //CHECK: sub <8 x i64>
   return _mm512_sub_epi64(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_add_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_add_epi32
-  //CHECK: add <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_add_epi32(__k,__A,__B);
 }
 
-__m512i test_mm512_mask_add_epi32 (__mmask16 __k,__m512i __A, __m512i __B, 
+//
+__m512i test_mm512_mask_add_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_add_epi32
-  //CHECK: add <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_add_epi32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_add_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_add_epi32
-  //CHECK: add <16 x i32>
   return _mm512_add_epi32(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_add_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_add_epi64
-  //CHECK: add <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_add_epi64(__k,__A,__B);
 }
 
-__m512i test_mm512_mask_add_epi64 (__mmask8 __k,__m512i __A, __m512i __B, 
+//
+__m512i test_mm512_mask_add_epi64 (__mmask8 __k,__m512i __A, __m512i __B,
                                    __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_add_epi64
-  //CHECK: add <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_add_epi64(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_add_epi64(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_add_epi64
-  //CHECK: add <8 x i64>
   return _mm512_add_epi64(__A,__B);
 }
 
+//
 __m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mul_epi32
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epi32(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_mul_epi32
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epi32(__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mask_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_mul_epi32
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: shl <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: ashr <8 x i64> %{{.*}}, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epi32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mul_epu32
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_mul_epu32(__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_mul_epu32
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_mul_epu32(__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mask_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_mul_epu32
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: and <8 x i64> %{{.*}}, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  //CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
-  //CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_mul_epu32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_mullo_epi32
-  //CHECK: mul <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_mullo_epi32(__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mask_mullo_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_mullo_epi32
-  //CHECK: mul <16 x i32> %{{.*}}, %{{.*}}
-  //CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_mullo_epi32(__src,__k,__A,__B);
 }
 
+//
 __m512i test_mm512_mullo_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_mullo_epi32
-  //CHECK: mul <16 x i32>
   return _mm512_mullo_epi32(__A,__B);
 }
 
+//
 __m512i test_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mullox_epi64
-  // CHECK: mul <8 x i64>
   return (__m512i) _mm512_mullox_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_mullox_epi64
-  // CHECK: mul <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return (__m512i) _mm512_mask_mullox_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512d test_mm512_add_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_add_round_pd
-  // CHECK: @llvm.x86.avx512.add.pd.512
   return _mm512_add_round_pd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_add_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_round_pd
-  // CHECK: @llvm.x86.avx512.add.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_add_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_add_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_round_pd
-  // CHECK: @llvm.x86.avx512.add.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_add_round_pd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_pd
-  // CHECK: fadd <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_add_pd(__W,__U,__A,__B); 
+  return _mm512_mask_add_pd(__W,__U,__A,__B);
 }
+//
 __m512d test_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_pd
-  // CHECK: fadd <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_add_pd(__U,__A,__B); 
+  return _mm512_maskz_add_pd(__U,__A,__B);
 }
+//
 __m512 test_mm512_add_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_add_round_ps
-  // CHECK: @llvm.x86.avx512.add.ps.512
   return _mm512_add_round_ps(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_add_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_round_ps
-  // CHECK: @llvm.x86.avx512.add.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_add_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_add_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_round_ps
-  // CHECK: @llvm.x86.avx512.add.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_add_round_ps(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_ps
-  // CHECK: fadd <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_add_ps(__W,__U,__A,__B); 
+  return _mm512_mask_add_ps(__W,__U,__A,__B);
 }
+//
 __m512 test_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_ps
-  // CHECK: fadd <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_add_ps(__U,__A,__B); 
+  return _mm512_maskz_add_ps(__U,__A,__B);
 }
+//
 __m128 test_mm_add_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_add_round_ss
-  // CHECK: @llvm.x86.avx512.mask.add.ss.round
   return _mm_add_round_ss(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_add_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_add_round_ss
-  // CHECK: @llvm.x86.avx512.mask.add.ss.round
   return _mm_mask_add_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_maskz_add_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_round_ss
-  // CHECK: @llvm.x86.avx512.mask.add.ss.round
   return _mm_maskz_add_round_ss(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_add_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_add_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fadd float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_mask_add_ss(__W,__U,__A,__B); 
+  return _mm_mask_add_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_add_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.add.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fadd float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_maskz_add_ss(__U,__A,__B); 
+  return _mm_maskz_add_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_add_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_add_round_sd
-  // CHECK: @llvm.x86.avx512.mask.add.sd.round
   return _mm_add_round_sd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_add_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_add_round_sd
-  // CHECK: @llvm.x86.avx512.mask.add.sd.round
   return _mm_mask_add_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_maskz_add_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_round_sd
-  // CHECK: @llvm.x86.avx512.mask.add.sd.round
   return _mm_maskz_add_round_sd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_add_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_add_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fadd double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_mask_add_sd(__W,__U,__A,__B); 
+  return _mm_mask_add_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.add.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fadd double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_maskz_add_sd(__U,__A,__B); 
+  return _mm_maskz_add_sd(__U,__A,__B);
 }
+//
 __m512d test_mm512_sub_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_sub_round_pd
-  // CHECK: @llvm.x86.avx512.sub.pd.512
   return _mm512_sub_round_pd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_sub_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_round_pd
-  // CHECK: @llvm.x86.avx512.sub.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_sub_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_sub_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_round_pd
-  // CHECK: @llvm.x86.avx512.sub.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_sub_round_pd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_pd
-  // CHECK: fsub <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_sub_pd(__W,__U,__A,__B); 
+  return _mm512_mask_sub_pd(__W,__U,__A,__B);
 }
+//
 __m512d test_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_pd
-  // CHECK: fsub <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_sub_pd(__U,__A,__B); 
+  return _mm512_maskz_sub_pd(__U,__A,__B);
 }
+//
 __m512 test_mm512_sub_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_sub_round_ps
-  // CHECK: @llvm.x86.avx512.sub.ps.512
   return _mm512_sub_round_ps(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_sub_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_round_ps
-  // CHECK: @llvm.x86.avx512.sub.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_sub_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_sub_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_round_ps
-  // CHECK: @llvm.x86.avx512.sub.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_sub_round_ps(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_ps
-  // CHECK: fsub <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_sub_ps(__W,__U,__A,__B); 
+  return _mm512_mask_sub_ps(__W,__U,__A,__B);
 }
+//
 __m512 test_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_ps
-  // CHECK: fsub <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_sub_ps(__U,__A,__B); 
+  return _mm512_maskz_sub_ps(__U,__A,__B);
 }
+//
 __m128 test_mm_sub_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_sub_round_ss
-  // CHECK: @llvm.x86.avx512.mask.sub.ss.round
   return _mm_sub_round_ss(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_sub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_round_ss
-  // CHECK: @llvm.x86.avx512.mask.sub.ss.round
   return _mm_mask_sub_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_maskz_sub_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_round_ss
-  // CHECK: @llvm.x86.avx512.mask.sub.ss.round
   return _mm_maskz_sub_round_ss(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fsub float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_mask_sub_ss(__W,__U,__A,__B); 
+  return _mm_mask_sub_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.sub.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fsub float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_maskz_sub_ss(__U,__A,__B); 
+  return _mm_maskz_sub_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_sub_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_sub_round_sd
-  // CHECK: @llvm.x86.avx512.mask.sub.sd.round
   return _mm_sub_round_sd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_sub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_round_sd
-  // CHECK: @llvm.x86.avx512.mask.sub.sd.round
   return _mm_mask_sub_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_maskz_sub_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_round_sd
-  // CHECK: @llvm.x86.avx512.mask.sub.sd.round
   return _mm_maskz_sub_round_sd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fsub double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_mask_sub_sd(__W,__U,__A,__B); 
+  return _mm_mask_sub_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.sub.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fsub double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_maskz_sub_sd(__U,__A,__B); 
+  return _mm_maskz_sub_sd(__U,__A,__B);
 }
+//
 __m512d test_mm512_mul_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mul_round_pd
-  // CHECK: @llvm.x86.avx512.mul.pd.512
   return _mm512_mul_round_pd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_mul_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_round_pd
-  // CHECK: @llvm.x86.avx512.mul.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_mul_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_mul_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_round_pd
-  // CHECK: @llvm.x86.avx512.mul.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_mul_round_pd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_pd
-  // CHECK: fmul <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_mul_pd(__W,__U,__A,__B); 
+  return _mm512_mask_mul_pd(__W,__U,__A,__B);
 }
+//
 __m512d test_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_pd
-  // CHECK: fmul <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_mul_pd(__U,__A,__B); 
+  return _mm512_maskz_mul_pd(__U,__A,__B);
 }
+//
 __m512 test_mm512_mul_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mul_round_ps
-  // CHECK: @llvm.x86.avx512.mul.ps.512
   return _mm512_mul_round_ps(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_mul_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_round_ps
-  // CHECK: @llvm.x86.avx512.mul.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_mul_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_mul_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_round_ps
-  // CHECK: @llvm.x86.avx512.mul.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_mul_round_ps(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_ps
-  // CHECK: fmul <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_mul_ps(__W,__U,__A,__B); 
+  return _mm512_mask_mul_ps(__W,__U,__A,__B);
 }
+//
 __m512 test_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_ps
-  // CHECK: fmul <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_mul_ps(__U,__A,__B); 
+  return _mm512_maskz_mul_ps(__U,__A,__B);
 }
+//
 __m128 test_mm_mul_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mul_round_ss
-  // CHECK: @llvm.x86.avx512.mask.mul.ss.round
   return _mm_mul_round_ss(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_mul_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_round_ss
-  // CHECK: @llvm.x86.avx512.mask.mul.ss.round
   return _mm_mask_mul_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_maskz_mul_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_round_ss
-  // CHECK: @llvm.x86.avx512.mask.mul.ss.round
   return _mm_maskz_mul_round_ss(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fmul float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_mask_mul_ss(__W,__U,__A,__B); 
+  return _mm_mask_mul_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_ss
-  // CHECK-NOT: @llvm.x86.avx512.mask.mul.ss.round
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fmul float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_maskz_mul_ss(__U,__A,__B); 
+  return _mm_maskz_mul_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_mul_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mul_round_sd
-  // CHECK: @llvm.x86.avx512.mask.mul.sd.round
   return _mm_mul_round_sd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_mul_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_round_sd
-  // CHECK: @llvm.x86.avx512.mask.mul.sd.round
   return _mm_mask_mul_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_maskz_mul_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_round_sd
-  // CHECK: @llvm.x86.avx512.mask.mul.sd.round
   return _mm_maskz_mul_round_sd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fmul double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_mask_mul_sd(__W,__U,__A,__B); 
+  return _mm_mask_mul_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_sd
-  // CHECK-NOT: @llvm.x86.avx512.mask.mul.sd.round
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fmul double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> {{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_maskz_mul_sd(__U,__A,__B); 
+  return _mm_maskz_mul_sd(__U,__A,__B);
 }
+//
 __m512d test_mm512_div_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_div_round_pd
-  // CHECK: @llvm.x86.avx512.div.pd.512
   return _mm512_div_round_pd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_mask_div_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_round_pd
-  // CHECK: @llvm.x86.avx512.div.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_div_round_pd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_maskz_div_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_round_pd
-  // CHECK: @llvm.x86.avx512.div.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512d test_mm512_div_pd(__m512d __a, __m512d __b) {
-  // CHECK-LABEL: @test_mm512_div_pd
-  // CHECK: fdiv <8 x double>
-  return _mm512_div_pd(__a,__b); 
+  return _mm512_div_pd(__a,__b);
 }
+//
 __m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) {
-  // CHECK-LABEL: @test_mm512_mask_div_pd
-  // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_div_pd(__w,__u,__a,__b); 
+  return _mm512_mask_div_pd(__w,__u,__a,__b);
 }
+//
 __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_pd
-  // CHECK: fdiv <8 x double> %{{.*}}, %{{.*}}
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_div_pd(__U,__A,__B); 
+  return _mm512_maskz_div_pd(__U,__A,__B);
 }
+//
 __m512 test_mm512_div_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_div_round_ps
-  // CHECK: @llvm.x86.avx512.div.ps.512
   return _mm512_div_round_ps(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_mask_div_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_round_ps
-  // CHECK: @llvm.x86.avx512.div.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_div_round_ps(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_round_ps
-  // CHECK: @llvm.x86.avx512.div.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m512 test_mm512_div_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_div_ps
-  // CHECK: fdiv <16 x float>
-  return _mm512_div_ps(__A,__B); 
+  return _mm512_div_ps(__A,__B);
 }
+//
 __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_ps
-  // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_div_ps(__W,__U,__A,__B); 
+  return _mm512_mask_div_ps(__W,__U,__A,__B);
 }
+//
 __m512 test_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_ps
-  // CHECK: fdiv <16 x float> %{{.*}}, %{{.*}}
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_div_ps(__U,__A,__B); 
+  return _mm512_maskz_div_ps(__U,__A,__B);
 }
+//
 __m128 test_mm_div_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_div_round_ss
-  // CHECK: @llvm.x86.avx512.mask.div.ss.round
   return _mm_div_round_ss(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_div_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_div_round_ss
-  // CHECK: @llvm.x86.avx512.mask.div.ss.round
   return _mm_mask_div_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_maskz_div_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_round_ss
-  // CHECK: @llvm.x86.avx512.mask.div.ss.round
   return _mm_maskz_div_round_ss(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128 test_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_div_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fdiv float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_mask_div_ss(__W,__U,__A,__B); 
+  return _mm_mask_div_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fdiv float %{{.*}}, %{{.*}}
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, float %{{.*}}, float %{{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float %{{.*}}, i64 0
-  return _mm_maskz_div_ss(__U,__A,__B); 
+  return _mm_maskz_div_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_div_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_div_round_sd
-  // CHECK: @llvm.x86.avx512.mask.div.sd.round
   return _mm_div_round_sd(__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_div_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_div_round_sd
-  // CHECK: @llvm.x86.avx512.mask.div.sd.round
   return _mm_mask_div_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_maskz_div_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_round_sd
-  // CHECK: @llvm.x86.avx512.mask.div.sd.round
   return _mm_maskz_div_round_sd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+//
 __m128d test_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_div_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fdiv double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_mask_div_sd(__W,__U,__A,__B); 
+  return _mm_mask_div_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fdiv double %{{.*}}, %{{.*}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 %{{.*}}, double %{{.*}}, double %{{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
-  return _mm_maskz_div_sd(__U,__A,__B); 
+  return _mm_maskz_div_sd(__U,__A,__B);
 }
+//
 __m128 test_mm_max_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_max_round_ss
-  // CHECK: @llvm.x86.avx512.mask.max.ss.round
-  return _mm_max_round_ss(__A,__B,0x08); 
+  return _mm_max_round_ss(__A,__B,0x08);
 }
+//
 __m128 test_mm_mask_max_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_max_round_ss
-  // CHECK: @llvm.x86.avx512.mask.max.ss.round
-  return _mm_mask_max_round_ss(__W,__U,__A,__B,0x08); 
+  return _mm_mask_max_round_ss(__W,__U,__A,__B,0x08);
 }
+//
 __m128 test_mm_maskz_max_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_round_ss
-  // CHECK: @llvm.x86.avx512.mask.max.ss.round
-  return _mm_maskz_max_round_ss(__U,__A,__B,0x08); 
+  return _mm_maskz_max_round_ss(__U,__A,__B,0x08);
 }
+//
 __m128 test_mm_mask_max_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_max_ss
-  // CHECK: @llvm.x86.avx512.mask.max.ss.round
-  return _mm_mask_max_ss(__W,__U,__A,__B); 
+  return _mm_mask_max_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_max_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_ss
-  // CHECK: @llvm.x86.avx512.mask.max.ss.round
-  return _mm_maskz_max_ss(__U,__A,__B); 
+  return _mm_maskz_max_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_max_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_max_round_sd
-  // CHECK: @llvm.x86.avx512.mask.max.sd.round
-  return _mm_max_round_sd(__A,__B,0x08); 
+  return _mm_max_round_sd(__A,__B,0x08);
 }
+//
 __m128d test_mm_mask_max_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_max_round_sd
-  // CHECK: @llvm.x86.avx512.mask.max.sd.round
-  return _mm_mask_max_round_sd(__W,__U,__A,__B,0x08); 
+  return _mm_mask_max_round_sd(__W,__U,__A,__B,0x08);
 }
+//
 __m128d test_mm_maskz_max_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_round_sd
-  // CHECK: @llvm.x86.avx512.mask.max.sd.round
-  return _mm_maskz_max_round_sd(__U,__A,__B,0x08); 
+  return _mm_maskz_max_round_sd(__U,__A,__B,0x08);
 }
+//
 __m128d test_mm_mask_max_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_max_sd
-  // CHECK: @llvm.x86.avx512.mask.max.sd.round
-  return _mm_mask_max_sd(__W,__U,__A,__B); 
+  return _mm_mask_max_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_max_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_sd
-  // CHECK: @llvm.x86.avx512.mask.max.sd.round
-  return _mm_maskz_max_sd(__U,__A,__B); 
+  return _mm_maskz_max_sd(__U,__A,__B);
 }
+//
 __m128 test_mm_min_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_min_round_ss
-  // CHECK: @llvm.x86.avx512.mask.min.ss.round
-  return _mm_min_round_ss(__A,__B,0x08); 
+  return _mm_min_round_ss(__A,__B,0x08);
 }
+//
 __m128 test_mm_mask_min_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_min_round_ss
-  // CHECK: @llvm.x86.avx512.mask.min.ss.round
-  return _mm_mask_min_round_ss(__W,__U,__A,__B,0x08); 
+  return _mm_mask_min_round_ss(__W,__U,__A,__B,0x08);
 }
+//
 __m128 test_mm_maskz_min_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_round_ss
-  // CHECK: @llvm.x86.avx512.mask.min.ss.round
-  return _mm_maskz_min_round_ss(__U,__A,__B,0x08); 
+  return _mm_maskz_min_round_ss(__U,__A,__B,0x08);
 }
+//
 __m128 test_mm_mask_min_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_min_ss
-  // CHECK: @llvm.x86.avx512.mask.min.ss.round
-  return _mm_mask_min_ss(__W,__U,__A,__B); 
+  return _mm_mask_min_ss(__W,__U,__A,__B);
 }
+//
 __m128 test_mm_maskz_min_ss(__mmask8 __U, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_ss
-  // CHECK: @llvm.x86.avx512.mask.min.ss.round
-  return _mm_maskz_min_ss(__U,__A,__B); 
+  return _mm_maskz_min_ss(__U,__A,__B);
 }
+//
 __m128d test_mm_min_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_min_round_sd
-  // CHECK: @llvm.x86.avx512.mask.min.sd.round
-  return _mm_min_round_sd(__A,__B,0x08); 
+  return _mm_min_round_sd(__A,__B,0x08);
 }
+//
 __m128d test_mm_mask_min_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_min_round_sd
-  // CHECK: @llvm.x86.avx512.mask.min.sd.round
-  return _mm_mask_min_round_sd(__W,__U,__A,__B,0x08); 
+  return _mm_mask_min_round_sd(__W,__U,__A,__B,0x08);
 }
+//
 __m128d test_mm_maskz_min_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_round_sd
-  // CHECK: @llvm.x86.avx512.mask.min.sd.round
-  return _mm_maskz_min_round_sd(__U,__A,__B,0x08); 
+  return _mm_maskz_min_round_sd(__U,__A,__B,0x08);
 }
+//
 __m128d test_mm_mask_min_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_min_sd
-  // CHECK: @llvm.x86.avx512.mask.min.sd.round
-  return _mm_mask_min_sd(__W,__U,__A,__B); 
+  return _mm_mask_min_sd(__W,__U,__A,__B);
 }
+//
 __m128d test_mm_maskz_min_sd(__mmask8 __U, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_sd
-  // CHECK: @llvm.x86.avx512.mask.min.sd.round
-  return _mm_maskz_min_sd(__U,__A,__B); 
+  return _mm_maskz_min_sd(__U,__A,__B);
 }
 
+//
 __m512 test_mm512_undefined(void) {
-  // CHECK-LABEL: @test_mm512_undefined
-  // CHECK: ret <16 x float> zeroinitializer
   return _mm512_undefined();
 }
 
+//
 __m512 test_mm512_undefined_ps(void) {
-  // CHECK-LABEL: @test_mm512_undefined_ps
-  // CHECK: ret <16 x float> zeroinitializer
   return _mm512_undefined_ps();
 }
 
+//
 __m512d test_mm512_undefined_pd(void) {
-  // CHECK-LABEL: @test_mm512_undefined_pd
-  // CHECK: ret <8 x double> zeroinitializer
   return _mm512_undefined_pd();
 }
 
+//
 __m512i test_mm512_undefined_epi32(void) {
-  // CHECK-LABEL: @test_mm512_undefined_epi32
-  // CHECK: ret <8 x i64> zeroinitializer
   return _mm512_undefined_epi32();
 }
 
+//
 __m512i test_mm512_cvtepi8_epi32(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi8_epi32
-  // CHECK: sext <16 x i8> %{{.*}} to <16 x i32>
-  return _mm512_cvtepi8_epi32(__A); 
+  return _mm512_cvtepi8_epi32(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi32
-  // CHECK: sext <16 x i8> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_cvtepi8_epi32(__W, __U, __A); 
+  return _mm512_mask_cvtepi8_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi32
-  // CHECK: sext <16 x i8> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_cvtepi8_epi32(__U, __A); 
+  return _mm512_maskz_cvtepi8_epi32(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepi8_epi64(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi8_epi64
-  // CHECK: sext <8 x i8> %{{.*}} to <8 x i64>
-  return _mm512_cvtepi8_epi64(__A); 
+  return _mm512_cvtepi8_epi64(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi64
-  // CHECK: sext <8 x i8> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepi8_epi64(__W, __U, __A); 
+  return _mm512_mask_cvtepi8_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi64
-  // CHECK: sext <8 x i8> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepi8_epi64(__U, __A); 
+  return _mm512_maskz_cvtepi8_epi64(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepi32_epi64(__m256i __X) {
-  // CHECK-LABEL: @test_mm512_cvtepi32_epi64
-  // CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
-  return _mm512_cvtepi32_epi64(__X); 
+  return _mm512_cvtepi32_epi64(__X);
 }
 
+//
 __m512i test_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi64
-  // CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepi32_epi64(__W, __U, __X); 
+  return _mm512_mask_cvtepi32_epi64(__W, __U, __X);
 }
 
+//
 __m512i test_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi64
-  // CHECK: sext <8 x i32> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepi32_epi64(__U, __X); 
+  return _mm512_maskz_cvtepi32_epi64(__U, __X);
 }
 
+//
 __m512i test_mm512_cvtepi16_epi32(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi16_epi32
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
-  return _mm512_cvtepi16_epi32(__A); 
+  return _mm512_cvtepi16_epi32(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi32
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_cvtepi16_epi32(__W, __U, __A); 
+  return _mm512_mask_cvtepi16_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi32
-  // CHECK: sext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_cvtepi16_epi32(__U, __A); 
+  return _mm512_maskz_cvtepi16_epi32(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepi16_epi64(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi16_epi64
-  // CHECK: sext <8 x i16> %{{.*}} to <8 x i64>
-  return _mm512_cvtepi16_epi64(__A); 
+  return _mm512_cvtepi16_epi64(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi64
-  // CHECK: sext <8 x i16> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepi16_epi64(__W, __U, __A); 
+  return _mm512_mask_cvtepi16_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi64
-  // CHECK: sext <8 x i16> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepi16_epi64(__U, __A); 
+  return _mm512_maskz_cvtepi16_epi64(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepu8_epi32(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu8_epi32
-  // CHECK: zext <16 x i8> %{{.*}} to <16 x i32>
-  return _mm512_cvtepu8_epi32(__A); 
+  return _mm512_cvtepu8_epi32(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi32
-  // CHECK: zext <16 x i8> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_cvtepu8_epi32(__W, __U, __A); 
+  return _mm512_mask_cvtepu8_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi32
-  // CHECK: zext <16 x i8> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_cvtepu8_epi32(__U, __A); 
+  return _mm512_maskz_cvtepu8_epi32(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepu8_epi64(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu8_epi64
-  // CHECK: zext <8 x i8> %{{.*}} to <8 x i64>
-  return _mm512_cvtepu8_epi64(__A); 
+  return _mm512_cvtepu8_epi64(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi64
-  // CHECK: zext <8 x i8> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepu8_epi64(__W, __U, __A); 
+  return _mm512_mask_cvtepu8_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi64
-  // CHECK: zext <8 x i8> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepu8_epi64(__U, __A); 
+  return _mm512_maskz_cvtepu8_epi64(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepu32_epi64(__m256i __X) {
-  // CHECK-LABEL: @test_mm512_cvtepu32_epi64
-  // CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
-  return _mm512_cvtepu32_epi64(__X); 
+  return _mm512_cvtepu32_epi64(__X);
 }
 
+//
 __m512i test_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu32_epi64
-  // CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepu32_epi64(__W, __U, __X); 
+  return _mm512_mask_cvtepu32_epi64(__W, __U, __X);
 }
 
+//
 __m512i test_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_epi64
-  // CHECK: zext <8 x i32> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepu32_epi64(__U, __X); 
+  return _mm512_maskz_cvtepu32_epi64(__U, __X);
 }
 
+//
 __m512i test_mm512_cvtepu16_epi32(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu16_epi32
-  // CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
-  return _mm512_cvtepu16_epi32(__A); 
+  return _mm512_cvtepu16_epi32(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi32
-  // CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_cvtepu16_epi32(__W, __U, __A); 
+  return _mm512_mask_cvtepu16_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi32
-  // CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_cvtepu16_epi32(__U, __A); 
+  return _mm512_maskz_cvtepu16_epi32(__U, __A);
 }
 
+//
 __m512i test_mm512_cvtepu16_epi64(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepu16_epi64
-  // CHECK: zext <8 x i16> %{{.*}} to <8 x i64>
-  return _mm512_cvtepu16_epi64(__A); 
+  return _mm512_cvtepu16_epi64(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi64
-  // CHECK: zext <8 x i16> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_cvtepu16_epi64(__W, __U, __A); 
+  return _mm512_mask_cvtepu16_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi64
-  // CHECK: zext <8 x i16> %{{.*}} to <8 x i64>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_cvtepu16_epi64(__U, __A); 
+  return _mm512_maskz_cvtepu16_epi64(__U, __A);
 }
 
 
+//
 __m512i test_mm512_rol_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_rol_epi32
-  // CHECK: @llvm.fshl.v16i32
-  return _mm512_rol_epi32(__A, 5); 
+  return _mm512_rol_epi32(__A, 5);
 }
 
+//
 __m512i test_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_rol_epi32
-  // CHECK: @llvm.fshl.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_rol_epi32(__W, __U, __A, 5); 
+  return _mm512_mask_rol_epi32(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_rol_epi32
-  // CHECK: @llvm.fshl.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_rol_epi32(__U, __A, 5); 
+  return _mm512_maskz_rol_epi32(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_rol_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_rol_epi64
-  // CHECK: @llvm.fshl.v8i64
-  return _mm512_rol_epi64(__A, 5); 
+  return _mm512_rol_epi64(__A, 5);
 }
 
+//
 __m512i test_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_rol_epi64
-  // CHECK: @llvm.fshl.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_rol_epi64(__W, __U, __A, 5); 
+  return _mm512_mask_rol_epi64(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_rol_epi64
-  // CHECK: @llvm.fshl.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_rol_epi64(__U, __A, 5); 
+  return _mm512_maskz_rol_epi64(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_rolv_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_rolv_epi32
-  // CHECK: @llvm.fshl.v16i32
-  return _mm512_rolv_epi32(__A, __B); 
+  return _mm512_rolv_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_rolv_epi32
-  // CHECK: @llvm.fshl.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_rolv_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_rolv_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_rolv_epi32
-  // CHECK: @llvm.fshl.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_rolv_epi32(__U, __A, __B); 
+  return _mm512_maskz_rolv_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_rolv_epi64(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_rolv_epi64
-  // CHECK: @llvm.fshl.v8i64
-  return _mm512_rolv_epi64(__A, __B); 
+  return _mm512_rolv_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_rolv_epi64
-  // CHECK: @llvm.fshl.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_rolv_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_rolv_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_rolv_epi64
-  // CHECK: @llvm.fshl.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_rolv_epi64(__U, __A, __B); 
+  return _mm512_maskz_rolv_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_ror_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_ror_epi32
-  // CHECK: @llvm.fshr.v16i32
-  return _mm512_ror_epi32(__A, 5); 
+  return _mm512_ror_epi32(__A, 5);
 }
 
+//
 __m512i test_mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_ror_epi32
-  // CHECK: @llvm.fshr.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_ror_epi32(__W, __U, __A, 5); 
+  return _mm512_mask_ror_epi32(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_ror_epi32
-  // CHECK: @llvm.fshr.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_ror_epi32(__U, __A, 5); 
+  return _mm512_maskz_ror_epi32(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_ror_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_ror_epi64
-  // CHECK: @llvm.fshr.v8i64
-  return _mm512_ror_epi64(__A, 5); 
+  return _mm512_ror_epi64(__A, 5);
 }
 
+//
 __m512i test_mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_ror_epi64
-  // CHECK: @llvm.fshr.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_ror_epi64(__W, __U, __A, 5); 
+  return _mm512_mask_ror_epi64(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_ror_epi64
-  // CHECK: @llvm.fshr.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_ror_epi64(__U, __A, 5); 
+  return _mm512_maskz_ror_epi64(__U, __A, 5);
 }
 
 
+//
 __m512i test_mm512_rorv_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_rorv_epi32
-  // CHECK: @llvm.fshr.v16i32
-  return _mm512_rorv_epi32(__A, __B); 
+  return _mm512_rorv_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_rorv_epi32
-  // CHECK: @llvm.fshr.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_rorv_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_rorv_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_rorv_epi32
-  // CHECK: @llvm.fshr.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_rorv_epi32(__U, __A, __B); 
+  return _mm512_maskz_rorv_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_rorv_epi64(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_rorv_epi64
-  // CHECK: @llvm.fshr.v8i64
-  return _mm512_rorv_epi64(__A, __B); 
+  return _mm512_rorv_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_rorv_epi64
-  // CHECK: @llvm.fshr.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_rorv_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_rorv_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_rorv_epi64
-  // CHECK: @llvm.fshr.v8i64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_rorv_epi64(__U, __A, __B); 
+  return _mm512_maskz_rorv_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_slli_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_slli_epi32
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  return _mm512_slli_epi32(__A, 5); 
+  return _mm512_slli_epi32(__A, 5);
 }
 
+//
 __m512i test_mm512_slli_epi32_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_slli_epi32_2
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  return _mm512_slli_epi32(__A, __B); 
+  return _mm512_slli_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi32
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_slli_epi32(__W, __U, __A, 5); 
+  return _mm512_mask_slli_epi32(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_slli_epi32_2(__m512i __W, __mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi32_2
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_slli_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_slli_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi32
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_slli_epi32(__U, __A, 5); 
+  return _mm512_maskz_slli_epi32(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_slli_epi32_2(__mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi32_2
-  // CHECK: @llvm.x86.avx512.pslli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_slli_epi32(__U, __A, __B); 
+  return _mm512_maskz_slli_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_slli_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_slli_epi64
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  return _mm512_slli_epi64(__A, 5); 
+  return _mm512_slli_epi64(__A, 5);
 }
 
+//
 __m512i test_mm512_slli_epi64_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_slli_epi64_2
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  return _mm512_slli_epi64(__A, __B); 
+  return _mm512_slli_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi64
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_slli_epi64(__W, __U, __A, 5); 
+  return _mm512_mask_slli_epi64(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_slli_epi64_2(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_slli_epi64_2
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_slli_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_slli_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi64
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_slli_epi64(__U, __A, 5); 
+  return _mm512_maskz_slli_epi64(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_slli_epi64_2(__mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_slli_epi64_2
-  // CHECK: @llvm.x86.avx512.pslli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_slli_epi64(__U, __A, __B); 
+  return _mm512_maskz_slli_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srli_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srli_epi32
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  return _mm512_srli_epi32(__A, 5); 
+  return _mm512_srli_epi32(__A, 5);
 }
 
+//
 __m512i test_mm512_srli_epi32_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srli_epi32_2
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  return _mm512_srli_epi32(__A, __B); 
+  return _mm512_srli_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi32
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srli_epi32(__W, __U, __A, 5); 
+  return _mm512_mask_srli_epi32(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_srli_epi32_2(__m512i __W, __mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi32_2
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srli_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_srli_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi32
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srli_epi32(__U, __A, 5); 
+  return _mm512_maskz_srli_epi32(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_srli_epi32_2(__mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi32_2
-  // CHECK: @llvm.x86.avx512.psrli.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srli_epi32(__U, __A, __B); 
+  return _mm512_maskz_srli_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srli_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srli_epi64
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  return _mm512_srli_epi64(__A, 5); 
+  return _mm512_srli_epi64(__A, 5);
 }
 
+//
 __m512i test_mm512_srli_epi64_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srli_epi64_2
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  return _mm512_srli_epi64(__A, __B); 
+  return _mm512_srli_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi64
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srli_epi64(__W, __U, __A, 5); 
+  return _mm512_mask_srli_epi64(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_srli_epi64_2(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srli_epi64_2
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srli_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_srli_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi64
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srli_epi64(__U, __A, 5); 
+  return _mm512_maskz_srli_epi64(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_srli_epi64_2(__mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srli_epi64_2
-  // CHECK: @llvm.x86.avx512.psrli.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srli_epi64(__U, __A, __B); 
+  return _mm512_maskz_srli_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_load_epi32
-  // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
-  return _mm512_mask_load_epi32(__W, __U, __P); 
+  return _mm512_mask_load_epi32(__W, __U, __P);
 }
 
+//
 __m512i test_mm512_maskz_load_epi32(__mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_load_epi32
-  // CHECK: @llvm.masked.load.v16i32.p0(ptr %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
-  return _mm512_maskz_load_epi32(__U, __P); 
+  return _mm512_maskz_load_epi32(__U, __P);
 }
 
+//
 __m512i test_mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_epi32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_mov_epi32(__W, __U, __A); 
+  return _mm512_mask_mov_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_epi32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_mov_epi32(__U, __A); 
+  return _mm512_maskz_mov_epi32(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_epi64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_mov_epi64(__W, __U, __A); 
+  return _mm512_mask_mov_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_epi64
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_mov_epi64(__U, __A); 
+  return _mm512_maskz_mov_epi64(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_load_epi64
-  // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
-  return _mm512_mask_load_epi64(__W, __U, __P); 
+  return _mm512_mask_load_epi64(__W, __U, __P);
 }
 
+//
 __m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_load_epi64
-  // CHECK: @llvm.masked.load.v8i64.p0(ptr %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
-  return _mm512_maskz_load_epi64(__U, __P); 
+  return _mm512_maskz_load_epi64(__U, __P);
 }
 
+//
 void test_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_store_epi32
-  // CHECK: @llvm.masked.store.v16i32.p0(<16 x i32> %{{.*}}, ptr %{{.*}}, i32 64, <16 x i1> %{{.*}})
-  return _mm512_mask_store_epi32(__P, __U, __A); 
+  return _mm512_mask_store_epi32(__P, __U, __A);
 }
 
+//
 void test_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_store_epi64
-  // CHECK: @llvm.masked.store.v8i64.p0(<8 x i64> %{{.*}}, ptr %{{.*}}, i32 64, <8 x i1> %{{.*}})
-  return _mm512_mask_store_epi64(__P, __U, __A); 
+  return _mm512_mask_store_epi64(__P, __U, __A);
 }
 
+//
 __m512d test_mm512_movedup_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_movedup_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   return _mm512_movedup_pd(__A);
 }
 
+//
 __m512d test_mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_movedup_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_movedup_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_movedup_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_movedup_pd(__U, __A);
 }
 
+//
 int test_mm_comi_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_comi_round_sd
-  // CHECK: @llvm.x86.avx512.vcomi.sd
-  return _mm_comi_round_sd(__A, __B, 5, _MM_FROUND_NO_EXC); 
+  return _mm_comi_round_sd(__A, __B, 5, _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_comi_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_comi_round_ss
-  // CHECK: @llvm.x86.avx512.vcomi.ss
-  return _mm_comi_round_ss(__A, __B, 5, _MM_FROUND_NO_EXC); 
+  return _mm_comi_round_ss(__A, __B, 5, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_fixupimm_round_pd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
-  return _mm512_fixupimm_round_pd(__A, __B, __C, 5, 8); 
+  return _mm512_fixupimm_round_pd(__A, __B, __C, 5, 8);
 }
 
+//
 __m512d test_mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_pd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
-  return _mm512_mask_fixupimm_round_pd(__A, __U, __B, __C, 5, 8); 
+  return _mm512_mask_fixupimm_round_pd(__A, __U, __B, __C, 5, 8);
 }
 
+//
 __m512d test_mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_fixupimm_pd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
-  return _mm512_fixupimm_pd(__A, __B, __C, 5); 
+  return _mm512_fixupimm_pd(__A, __B, __C, 5);
 }
 
+//
 __m512d test_mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_fixupimm_pd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
-  return _mm512_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+  return _mm512_mask_fixupimm_pd(__A, __U, __B, __C, 5);
 }
 
+//
 __m512d test_mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_pd
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
-  return _mm512_maskz_fixupimm_round_pd(__U, __A, __B, __C, 5, 8); 
+  return _mm512_maskz_fixupimm_round_pd(__U, __A, __B, __C, 5, 8);
 }
 
+//
 __m512d test_mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fixupimm_pd
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
-  return _mm512_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+  return _mm512_maskz_fixupimm_pd(__U, __A, __B, __C, 5);
 }
 
+//
 __m512 test_mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_fixupimm_round_ps
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
-  return _mm512_fixupimm_round_ps(__A, __B, __C, 5, 8); 
+  return _mm512_fixupimm_round_ps(__A, __B, __C, 5, 8);
 }
 
+//
 __m512 test_mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_ps
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
-  return _mm512_mask_fixupimm_round_ps(__A, __U, __B, __C, 5, 8); 
+  return _mm512_mask_fixupimm_round_ps(__A, __U, __B, __C, 5, 8);
 }
 
+//
 __m512 test_mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_fixupimm_ps
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
-  return _mm512_fixupimm_ps(__A, __B, __C, 5); 
+  return _mm512_fixupimm_ps(__A, __B, __C, 5);
 }
 
+//
 __m512 test_mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_fixupimm_ps
-  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
-  return _mm512_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+  return _mm512_mask_fixupimm_ps(__A, __U, __B, __C, 5);
 }
 
+//
 __m512 test_mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_ps
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
-  return _mm512_maskz_fixupimm_round_ps(__U, __A, __B, __C, 5, 8); 
+  return _mm512_maskz_fixupimm_round_ps(__U, __A, __B, __C, 5, 8);
 }
 
+//
 __m512 test_mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fixupimm_ps
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
-  return _mm512_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+  return _mm512_maskz_fixupimm_ps(__U, __A, __B, __C, 5);
 }
 
+//
 __m128d test_mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_fixupimm_round_sd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_fixupimm_round_sd(__A, __B, __C, 5, 8); 
+  return _mm_fixupimm_round_sd(__A, __B, __C, 5, 8);
 }
 
+//
 __m128d test_mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_mask_fixupimm_round_sd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_mask_fixupimm_round_sd(__A, __U, __B, __C, 5, 8); 
+  return _mm_mask_fixupimm_round_sd(__A, __U, __B, __C, 5, 8);
 }
 
+//
 __m128d test_mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_fixupimm_sd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_fixupimm_sd(__A, __B, __C, 5); 
+  return _mm_fixupimm_sd(__A, __B, __C, 5);
 }
 
+//
 __m128d test_mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_mask_fixupimm_sd
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_mask_fixupimm_sd(__A, __U, __B, __C, 5); 
+  return _mm_mask_fixupimm_sd(__A, __U, __B, __C, 5);
 }
 
+//
 __m128d test_mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_sd
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm
-  return _mm_maskz_fixupimm_round_sd(__U, __A, __B, __C, 5, 8); 
+  return _mm_maskz_fixupimm_round_sd(__U, __A, __B, __C, 5, 8);
 }
 
+//
 __m128d test_mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_maskz_fixupimm_sd
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm
-  return _mm_maskz_fixupimm_sd(__U, __A, __B, __C, 5); 
+  return _mm_maskz_fixupimm_sd(__U, __A, __B, __C, 5);
 }
 
+//
 __m128 test_mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_fixupimm_round_ss
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_fixupimm_round_ss(__A, __B, __C, 5, 8); 
+  return _mm_fixupimm_round_ss(__A, __B, __C, 5, 8);
 }
 
+//
 __m128 test_mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_mask_fixupimm_round_ss
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_mask_fixupimm_round_ss(__A, __U, __B, __C, 5, 8); 
+  return _mm_mask_fixupimm_round_ss(__A, __U, __B, __C, 5, 8);
 }
 
+//
 __m128 test_mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_fixupimm_ss
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_fixupimm_ss(__A, __B, __C, 5); 
+  return _mm_fixupimm_ss(__A, __B, __C, 5);
 }
 
+//
 __m128 test_mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_mask_fixupimm_ss
-  // CHECK: @llvm.x86.avx512.mask.fixupimm
-  return _mm_mask_fixupimm_ss(__A, __U, __B, __C, 5); 
+  return _mm_mask_fixupimm_ss(__A, __U, __B, __C, 5);
 }
 
+//
 __m128 test_mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_ss
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm
-  return _mm_maskz_fixupimm_round_ss(__U, __A, __B, __C, 5, 8); 
+  return _mm_maskz_fixupimm_round_ss(__U, __A, __B, __C, 5, 8);
 }
 
+//
 __m128 test_mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
-  // CHECK-LABEL: @test_mm_maskz_fixupimm_ss
-  // CHECK: @llvm.x86.avx512.maskz.fixupimm
-  return _mm_maskz_fixupimm_ss(__U, __A, __B, __C, 5); 
+  return _mm_maskz_fixupimm_ss(__U, __A, __B, __C, 5);
 }
 
+//
 __m128d test_mm_getexp_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_getexp_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
-  return _mm_getexp_round_sd(__A, __B, 8); 
+  return _mm_getexp_round_sd(__A, __B, 8);
 }
 
+//
 __m128d test_mm_getexp_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_getexp_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
-  return _mm_getexp_sd(__A, __B); 
+  return _mm_getexp_sd(__A, __B);
 }
 
+//
 __m128 test_mm_getexp_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_getexp_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
-  return _mm_getexp_round_ss(__A, __B, 8); 
+  return _mm_getexp_round_ss(__A, __B, 8);
 }
 
+//
 __m128 test_mm_getexp_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_getexp_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
-  return _mm_getexp_ss(__A, __B); 
+  return _mm_getexp_ss(__A, __B);
 }
 
+//
 __m128d test_mm_getmant_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_getmant_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
-  return _mm_getmant_round_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+  return _mm_getmant_round_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8);
 }
 
+//
 __m128d test_mm_getmant_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_getmant_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
-  return _mm_getmant_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+  return _mm_getmant_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 }
 
+//
 __m128 test_mm_getmant_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_getmant_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
-  return _mm_getmant_round_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+  return _mm_getmant_round_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8);
 }
 
+//
 __m128 test_mm_getmant_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_getmant_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
-  return _mm_getmant_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+  return _mm_getmant_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 }
 
+//
 __mmask16 test_mm512_kmov(__mmask16 __A) {
-  // CHECK-LABEL: @test_mm512_kmov
-  // CHECK: load i16, ptr %__A.addr.i, align 2{{$}}
-  return _mm512_kmov(__A); 
+  return _mm512_kmov(__A);
 }
 
+//
 __m512d test_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_unpackhi_pd(__W, __U, __A, __B); 
+  return _mm512_mask_unpackhi_pd(__W, __U, __A, __B);
 }
 #if __x86_64__
+//
 long long test_mm_cvt_roundsd_si64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_si64
-  // CHECK: @llvm.x86.avx512.vcvtsd2si64
   return _mm_cvt_roundsd_si64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 #endif
+//
 __m512i test_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi32
-  // CHECK: @llvm.x86.avx512.vpermi2var.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask2_permutex2var_epi32(__A, __I, __U, __B); 
+  return _mm512_mask2_permutex2var_epi32(__A, __I, __U, __B);
 }
+//
 __m512i test_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpackhi_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  return _mm512_unpackhi_epi32(__A, __B); 
+  return _mm512_unpackhi_epi32(__A, __B);
 }
 
+//
 __m512d test_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_unpackhi_pd(__U, __A, __B); 
+  return _mm512_maskz_unpackhi_pd(__U, __A, __B);
 }
 #if __x86_64__
+//
 long long test_mm_cvt_roundsd_i64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_i64
-  // CHECK: @llvm.x86.avx512.vcvtsd2si64
   return _mm_cvt_roundsd_i64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 #endif
+//
 __m512d test_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask2_permutex2var_pd
-  // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask2_permutex2var_pd(__A, __I, __U, __B); 
+  return _mm512_mask2_permutex2var_pd(__A, __I, __U, __B);
 }
+//
 __m512i test_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_unpackhi_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_unpackhi_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512 test_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_unpackhi_ps(__W, __U, __A, __B); 
+  return _mm512_mask_unpackhi_ps(__W, __U, __A, __B);
 }
 
+//
 __m512 test_mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_unpackhi_ps(__U, __A, __B); 
+  return _mm512_maskz_unpackhi_ps(__U, __A, __B);
 }
 
+//
 __m512d test_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_unpacklo_pd(__W, __U, __A, __B); 
+  return _mm512_mask_unpacklo_pd(__W, __U, __A, __B);
 }
 
+//
 __m512d test_mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_unpacklo_pd(__U, __A, __B); 
+  return _mm512_maskz_unpacklo_pd(__U, __A, __B);
 }
 
+//
 __m512 test_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_unpacklo_ps(__W, __U, __A, __B); 
+  return _mm512_mask_unpacklo_ps(__W, __U, __A, __B);
 }
 
+//
 __m512 test_mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_unpacklo_ps(__U, __A, __B); 
+  return _mm512_maskz_unpacklo_ps(__U, __A, __B);
 }
+//
 int test_mm_cvt_roundsd_si32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_si32
-  // CHECK: @llvm.x86.avx512.vcvtsd2si32
   return _mm_cvt_roundsd_si32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvt_roundsd_i32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_i32
-  // CHECK: @llvm.x86.avx512.vcvtsd2si32
   return _mm_cvt_roundsd_i32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned test_mm_cvt_roundsd_u32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_u32
-  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
   return _mm_cvt_roundsd_u32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned test_mm_cvtsd_u32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtsd_u32
-  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
-  return _mm_cvtsd_u32(__A); 
+  return _mm_cvtsd_u32(__A);
 }
 
+//
 int test_mm512_cvtsi512_si32(__m512i a) {
-  // CHECK-LABEL: test_mm512_cvtsi512_si32
-  // CHECK: %{{.*}} = extractelement <16 x i32> %{{.*}}, i32 0
   return _mm512_cvtsi512_si32(a);
 }
 
 #ifdef __x86_64__
+//
 unsigned long long test_mm_cvt_roundsd_u64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_u64
-  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
   return _mm_cvt_roundsd_u64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned long long test_mm_cvtsd_u64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtsd_u64
-  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
-  return _mm_cvtsd_u64(__A); 
+  return _mm_cvtsd_u64(__A);
 }
 #endif
 
+//
 int test_mm_cvt_roundss_si32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_si32
-  // CHECK: @llvm.x86.avx512.vcvtss2si32
   return _mm_cvt_roundss_si32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvt_roundss_i32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_i32
-  // CHECK: @llvm.x86.avx512.vcvtss2si32
   return _mm_cvt_roundss_i32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvt_roundss_si64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_si64
-  // CHECK: @llvm.x86.avx512.vcvtss2si64
   return _mm_cvt_roundss_si64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 long long test_mm_cvt_roundss_i64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_i64
-  // CHECK: @llvm.x86.avx512.vcvtss2si64
   return _mm_cvt_roundss_i64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 #endif
 
+//
 unsigned test_mm_cvt_roundss_u32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_u32
-  // CHECK: @llvm.x86.avx512.vcvtss2usi32
   return _mm_cvt_roundss_u32(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned test_mm_cvtss_u32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtss_u32
-  // CHECK: @llvm.x86.avx512.vcvtss2usi32
-  return _mm_cvtss_u32(__A); 
+  return _mm_cvtss_u32(__A);
 }
 
 #ifdef __x86_64__
+//
 unsigned long long test_mm_cvt_roundss_u64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_u64
-  // CHECK: @llvm.x86.avx512.vcvtss2usi64
   return _mm_cvt_roundss_u64(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned long long test_mm_cvtss_u64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtss_u64
-  // CHECK: @llvm.x86.avx512.vcvtss2usi64
-  return _mm_cvtss_u64(__A); 
+  return _mm_cvtss_u64(__A);
 }
 #endif
 
+//
 int test_mm_cvtt_roundsd_i32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_i32
-  // CHECK: @llvm.x86.avx512.cvttsd2si
   return _mm_cvtt_roundsd_i32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvtt_roundsd_si32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_si32
-  // CHECK: @llvm.x86.avx512.cvttsd2si
   return _mm_cvtt_roundsd_si32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvttsd_i32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttsd_i32
-  // CHECK: @llvm.x86.avx512.cvttsd2si
-  return _mm_cvttsd_i32(__A); 
+  return _mm_cvttsd_i32(__A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvtt_roundsd_si64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_si64
-  // CHECK: @llvm.x86.avx512.cvttsd2si64
   return _mm_cvtt_roundsd_si64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 long long test_mm_cvtt_roundsd_i64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_i64
-  // CHECK: @llvm.x86.avx512.cvttsd2si64
   return _mm_cvtt_roundsd_i64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 long long test_mm_cvttsd_i64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttsd_i64
-  // CHECK: @llvm.x86.avx512.cvttsd2si64
-  return _mm_cvttsd_i64(__A); 
+  return _mm_cvttsd_i64(__A);
 }
 #endif
 
+//
 unsigned test_mm_cvtt_roundsd_u32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_u32
-  // CHECK: @llvm.x86.avx512.cvttsd2usi
   return _mm_cvtt_roundsd_u32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned test_mm_cvttsd_u32(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttsd_u32
-  // CHECK: @llvm.x86.avx512.cvttsd2usi
-  return _mm_cvttsd_u32(__A); 
+  return _mm_cvttsd_u32(__A);
 }
 
 #ifdef __x86_64__
+//
 unsigned long long test_mm_cvtt_roundsd_u64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundsd_u64
-  // CHECK: @llvm.x86.avx512.cvttsd2usi64
   return _mm_cvtt_roundsd_u64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned long long test_mm_cvttsd_u64(__m128d __A) {
-  // CHECK-LABEL: @test_mm_cvttsd_u64
-  // CHECK: @llvm.x86.avx512.cvttsd2usi64
-  return _mm_cvttsd_u64(__A); 
+  return _mm_cvttsd_u64(__A);
 }
 #endif
 
+//
 int test_mm_cvtt_roundss_i32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_i32
-  // CHECK: @llvm.x86.avx512.cvttss2si
   return _mm_cvtt_roundss_i32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvtt_roundss_si32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_si32
-  // CHECK: @llvm.x86.avx512.cvttss2si
   return _mm_cvtt_roundss_si32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 int test_mm_cvttss_i32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttss_i32
-  // CHECK: @llvm.x86.avx512.cvttss2si
-  return _mm_cvttss_i32(__A); 
+  return _mm_cvttss_i32(__A);
 }
 
 #ifdef __x86_64__
+//
 float test_mm_cvtt_roundss_i64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_i64
-  // CHECK: @llvm.x86.avx512.cvttss2si64
   return _mm_cvtt_roundss_i64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 long long test_mm_cvtt_roundss_si64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_si64
-  // CHECK: @llvm.x86.avx512.cvttss2si64
   return _mm_cvtt_roundss_si64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 long long test_mm_cvttss_i64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttss_i64
-  // CHECK: @llvm.x86.avx512.cvttss2si64
-  return _mm_cvttss_i64(__A); 
+  return _mm_cvttss_i64(__A);
 }
 #endif
 
+//
 unsigned test_mm_cvtt_roundss_u32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_u32
-  // CHECK: @llvm.x86.avx512.cvttss2usi
   return _mm_cvtt_roundss_u32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned test_mm_cvttss_u32(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttss_u32
-  // CHECK: @llvm.x86.avx512.cvttss2usi
-  return _mm_cvttss_u32(__A); 
+  return _mm_cvttss_u32(__A);
 }
 
 #ifdef __x86_64__
+//
 unsigned long long test_mm_cvtt_roundss_u64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvtt_roundss_u64
-  // CHECK: @llvm.x86.avx512.cvttss2usi64
   return _mm_cvtt_roundss_u64(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 unsigned long long test_mm_cvttss_u64(__m128 __A) {
-  // CHECK-LABEL: @test_mm_cvttss_u64
-  // CHECK: @llvm.x86.avx512.cvttss2usi64
-  return _mm_cvttss_u64(__A); 
+  return _mm_cvttss_u64(__A);
 }
 #endif
 
-__m512i test_mm512_cvtt_roundps_epu32(__m512 __A) 
+//
+__m512i test_mm512_cvtt_roundps_epu32(__m512 __A)
 {
-    // CHECK-LABEL: @test_mm512_cvtt_roundps_epu32
-    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
     return _mm512_cvtt_roundps_epu32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
 {
-    // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epu32
-    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
     return _mm512_mask_cvtt_roundps_epu32(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_maskz_cvtt_roundps_epu32( __mmask16 __U, __m512 __A)
 {
-    // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epu32
-    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
 
     return _mm512_maskz_cvtt_roundps_epu32(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_cvt_roundps_ph(__m512  __A)
 {
-    // CHECK-LABEL: @test_mm512_cvt_roundps_ph
-    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
     return _mm512_cvt_roundps_ph(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_mask_cvt_roundps_ph(__m256i __W , __mmask16 __U, __m512  __A)
 {
-    // CHECK-LABEL: @test_mm512_mask_cvt_roundps_ph
-    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
     return _mm512_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_maskz_cvt_roundps_ph(__mmask16 __U, __m512  __A)
 {
-    // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_ph
-    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
     return _mm512_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_cvt_roundph_ps(__m256i __A)
 {
-    // CHECK-LABEL: @test_mm512_cvt_roundph_ps
-    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512(
     return _mm512_cvt_roundph_ps(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, __m256i __A)
 {
-    // CHECK-LABEL: @test_mm512_mask_cvt_roundph_ps
-    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512(
     return _mm512_mask_cvt_roundph_ps(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A)
 {
-    // CHECK-LABEL: @test_mm512_maskz_cvt_roundph_ps
-    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512(
     return _mm512_maskz_cvt_roundph_ps(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_cvt_roundepi32_ps( __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundepi32_ps
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f32.v16i32
   return _mm512_cvt_roundepi32_ps(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepi32_ps
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f32.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvt_roundepi32_ps(__W,__U,__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi32_ps
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f32.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvt_roundepi32_ps(__U,__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_cvt_roundepu32_ps(__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundepu32_ps
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f32.v16i32
   return _mm512_cvt_roundepu32_ps(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U,__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundepu32_ps
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f32.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvt_roundepu32_ps(__W,__U,__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_cvt_roundepu32_ps(__mmask16 __U,__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu32_ps
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f32.v16i32
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvt_roundepu32_ps(__U,__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256 test_mm512_cvt_roundpd_ps(__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundpd_ps
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_cvt_roundpd_ps(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256 test_mm512_mask_cvt_roundpd_ps(__m256 W, __mmask8 U,__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_ps
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_mask_cvt_roundpd_ps(W,U,A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256 test_mm512_maskz_cvt_roundpd_ps(__mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_ps
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_maskz_cvt_roundpd_ps(U,A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_cvtt_roundpd_epi32(__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
   return _mm512_cvtt_roundpd_epi32(A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_mask_cvtt_roundpd_epi32(__m256i W, __mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
   return _mm512_mask_cvtt_roundpd_epi32(W,U,A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_maskz_cvtt_roundpd_epi32(__mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
   return _mm512_maskz_cvtt_roundpd_epi32(U,A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_cvtt_roundps_epi32(__m512 A)
 {
-  // CHECK-LABEL: @test_mm512_cvtt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
   return _mm512_cvtt_roundps_epi32(A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_mask_cvtt_roundps_epi32(__m512i W,__mmask16 U, __m512 A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
   return _mm512_mask_cvtt_roundps_epi32(W,U,A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_maskz_cvtt_roundps_epi32(__mmask16 U, __m512 A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
   return _mm512_maskz_cvtt_roundps_epi32(U,A,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_cvt_roundps_epi32(__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_cvt_roundps_epi32(__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_mask_cvt_roundps_epi32(__m512i __W,__mmask16 __U,__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_mask_cvt_roundps_epi32(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_maskz_cvt_roundps_epi32(__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_cvt_roundpd_epi32(__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_cvt_roundpd_epi32(A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_mask_cvt_roundpd_epi32(__m256i W,__mmask8 U,__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_mask_cvt_roundpd_epi32(W,U,A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_maskz_cvt_roundpd_epi32(__mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epi32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_maskz_cvt_roundpd_epi32(U,A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_cvt_roundps_epu32(__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_cvt_roundps_epu32(__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_mask_cvt_roundps_epu32(__m512i __W,__mmask16 __U,__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_mask_cvt_roundps_epu32(__W,__U,__A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_maskz_cvt_roundps_epu32(__mmask16 __U,__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_maskz_cvt_roundps_epu32(__U,__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_cvt_roundpd_epu32(__m512d A)
 {
-  // CHECK-LABEL: @test_mm512_cvt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_cvt_roundpd_epu32(A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_mask_cvt_roundpd_epu32(__m256i W, __mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_mask_cvt_roundpd_epu32(W,U,A,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
-__m256i test_mm512_maskz_cvt_roundpd_epu32(__mmask8 U, __m512d A) 
+//
+__m256i test_mm512_maskz_cvt_roundpd_epu32(__mmask8 U, __m512d A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_maskz_cvt_roundpd_epu32(U, A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask2_permutex2var_ps
-  // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask2_permutex2var_ps(__A, __I, __U, __B); 
+  return _mm512_mask2_permutex2var_ps(__A, __I, __U, __B);
 }
 
+//
 __m512i test_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi64
-  // CHECK: @llvm.x86.avx512.vpermi2var.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask2_permutex2var_epi64(__A, __I, __U, __B); 
+  return _mm512_mask2_permutex2var_epi64(__A, __I, __U, __B);
 }
 
+//
 __m512d test_mm512_permute_pd(__m512d __X) {
-  // CHECK-LABEL: @test_mm512_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   return _mm512_permute_pd(__X, 2);
 }
 
+//
 __m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
-  // CHECK-LABEL: @test_mm512_mask_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_permute_pd(__W, __U, __X, 2);
 }
 
+//
 __m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
-  // CHECK-LABEL: @test_mm512_maskz_permute_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_permute_pd(__U, __X, 2);
 }
 
+//
 __m512 test_mm512_permute_ps(__m512 __X) {
-  // CHECK-LABEL: @test_mm512_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
   return _mm512_permute_ps(__X, 2);
 }
 
+//
 __m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
-  // CHECK-LABEL: @test_mm512_mask_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_permute_ps(__W, __U, __X, 2);
 }
 
+//
 __m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
-  // CHECK-LABEL: @test_mm512_maskz_permute_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_permute_ps(__U, __X, 2);
 }
 
+//
 __m512d test_mm512_permutevar_pd(__m512d __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_permutevar_pd
-  // CHECK: @llvm.x86.avx512.vpermilvar.pd.512
-  return _mm512_permutevar_pd(__A, __C); 
+  return _mm512_permutevar_pd(__A, __C);
 }
 
+//
 __m512d test_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_permutevar_pd
-  // CHECK: @llvm.x86.avx512.vpermilvar.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_permutevar_pd(__W, __U, __A, __C); 
+  return _mm512_mask_permutevar_pd(__W, __U, __A, __C);
 }
 
+//
 __m512d test_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_permutevar_pd
-  // CHECK: @llvm.x86.avx512.vpermilvar.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_permutevar_pd(__U, __A, __C); 
+  return _mm512_maskz_permutevar_pd(__U, __A, __C);
 }
 
+//
 __m512 test_mm512_permutevar_ps(__m512 __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_permutevar_ps
-  // CHECK: @llvm.x86.avx512.vpermilvar.ps.512
-  return _mm512_permutevar_ps(__A, __C); 
+  return _mm512_permutevar_ps(__A, __C);
 }
 
+//
 __m512 test_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_permutevar_ps
-  // CHECK: @llvm.x86.avx512.vpermilvar.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_permutevar_ps(__W, __U, __A, __C); 
+  return _mm512_mask_permutevar_ps(__W, __U, __A, __C);
 }
 
+//
 __m512 test_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_permutevar_ps
-  // CHECK: @llvm.x86.avx512.vpermilvar.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_permutevar_ps(__U, __A, __C); 
+  return _mm512_maskz_permutevar_ps(__U, __A, __C);
 }
 
+//
 __m512i test_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_permutex2var_epi32
-  // CHECK: @llvm.x86.avx512.vpermi2var.d.512
-  return _mm512_permutex2var_epi32(__A, __I, __B); 
+  return _mm512_permutex2var_epi32(__A, __I, __B);
 }
 
+//
 __m512i test_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi32
-  // CHECK: @llvm.x86.avx512.vpermi2var.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_permutex2var_epi32(__U, __A, __I, __B); 
+  return _mm512_maskz_permutex2var_epi32(__U, __A, __I, __B);
 }
 
+//
 __m512i test_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, __m512i __I, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi32 
-  // CHECK: @llvm.x86.avx512.vpermi2var.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_permutex2var_epi32 (__A,__U,__I,__B);
 }
 
+//
 __m512d test_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_permutex2var_pd 
-  // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
   return _mm512_permutex2var_pd (__A, __I,__B);
 }
 
+//
 __m512d test_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_permutex2var_pd 
-  // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_permutex2var_pd (__A,__U,__I,__B);
 }
 
+//
 __m512d test_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex2var_pd
-  // CHECK: @llvm.x86.avx512.vpermi2var.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_permutex2var_pd(__U, __A, __I, __B); 
+  return _mm512_maskz_permutex2var_pd(__U, __A, __I, __B);
 }
 
+//
 __m512 test_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_permutex2var_ps 
-  // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
   return _mm512_permutex2var_ps (__A, __I, __B);
 }
 
+//
 __m512 test_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_permutex2var_ps 
-  // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_permutex2var_ps (__A,__U,__I,__B);
 }
 
+//
 __m512 test_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex2var_ps
-  // CHECK: @llvm.x86.avx512.vpermi2var.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_permutex2var_ps(__U, __A, __I, __B); 
+  return _mm512_maskz_permutex2var_ps(__U, __A, __I, __B);
 }
 
+//
 __m512i test_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B){
-  // CHECK-LABEL: @test_mm512_permutex2var_epi64
-  // CHECK: @llvm.x86.avx512.vpermi2var.q.512
   return _mm512_permutex2var_epi64(__A, __I, __B);
 }
 
+//
 __m512i test_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, __m512i __B){
-  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi64
-  // CHECK: @llvm.x86.avx512.vpermi2var.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_permutex2var_epi64(__A, __U, __I, __B);
 }
 
+//
 __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi64
-  // CHECK: @llvm.x86.avx512.vpermi2var.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
 }
+//
 __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_testn_epi32_mask
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  return _mm512_testn_epi32_mask(__A, __B); 
+  return _mm512_testn_epi32_mask(__A, __B);
 }
 
+//
 __mmask16 test_mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_testn_epi32_mask
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
-  return _mm512_mask_testn_epi32_mask(__U, __A, __B); 
+  return _mm512_mask_testn_epi32_mask(__U, __A, __B);
 }
 
+//
 __mmask8 test_mm512_testn_epi64_mask(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_testn_epi64_mask
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  return _mm512_testn_epi64_mask(__A, __B); 
+  return _mm512_testn_epi64_mask(__A, __B);
 }
 
+//
 __mmask8 test_mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_testn_epi64_mask
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
-  return _mm512_mask_testn_epi64_mask(__U, __A, __B); 
+  return _mm512_mask_testn_epi64_mask(__U, __A, __B);
 }
 
+//
 __mmask16 test_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_test_epi32_mask 
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return _mm512_mask_test_epi32_mask (__U,__A,__B);
 }
 
+//
 __mmask8 test_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_test_epi64_mask 
-  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
-  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return _mm512_mask_test_epi64_mask (__U,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_unpackhi_epi32(__U, __A, __B); 
+  return _mm512_maskz_unpackhi_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpackhi_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  return _mm512_unpackhi_epi64(__A, __B); 
+  return _mm512_unpackhi_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_unpackhi_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_unpackhi_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_unpackhi_epi64(__U, __A, __B); 
+  return _mm512_maskz_unpackhi_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpacklo_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  return _mm512_unpacklo_epi32(__A, __B); 
+  return _mm512_unpacklo_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_unpacklo_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_unpacklo_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_unpacklo_epi32(__U, __A, __B); 
+  return _mm512_maskz_unpacklo_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_unpacklo_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  return _mm512_unpacklo_epi64(__A, __B); 
+  return _mm512_unpacklo_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_unpacklo_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_unpacklo_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_unpacklo_epi64(__U, __A, __B); 
+  return _mm512_maskz_unpacklo_epi64(__U, __A, __B);
 }
 
+//
 __m128d test_mm_roundscale_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_roundscale_round_sd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
-  return _mm_roundscale_round_sd(__A, __B, 3, _MM_FROUND_NO_EXC); 
+  return _mm_roundscale_round_sd(__A, __B, 3, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_roundscale_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_roundscale_sd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
-  return _mm_roundscale_sd(__A, __B, 3); 
+  return _mm_roundscale_sd(__A, __B, 3);
 }
 
+//
 __m128d test_mm_mask_roundscale_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
     return _mm_mask_roundscale_sd(__W,__U,__A,__B,3);
 }
 
+//
 __m128d test_mm_mask_roundscale_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
     return _mm_mask_roundscale_round_sd(__W,__U,__A,__B,3,_MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_roundscale_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
     return _mm_maskz_roundscale_sd(__U,__A,__B,3);
 }
 
+//
 __m128d test_mm_maskz_roundscale_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
     return _mm_maskz_roundscale_round_sd(__U,__A,__B,3,_MM_FROUND_NO_EXC );
 }
 
+//
 __m128 test_mm_roundscale_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_roundscale_round_ss
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
   return _mm_roundscale_round_ss(__A, __B, 3, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_roundscale_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_roundscale_ss
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
-  return _mm_roundscale_ss(__A, __B, 3); 
+  return _mm_roundscale_ss(__A, __B, 3);
 }
 
+//
 __m128 test_mm_mask_roundscale_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_roundscale_ss
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
     return _mm_mask_roundscale_ss(__W,__U,__A,__B,3);
 }
 
+//
 __m128 test_mm_maskz_roundscale_round_ss( __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_roundscale_round_ss
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
     return _mm_maskz_roundscale_round_ss(__U,__A,__B,3,_MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_roundscale_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_roundscale_ss
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
     return _mm_maskz_roundscale_ss(__U,__A,__B,3);
 }
 
+//
 __m512d test_mm512_scalef_round_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_scalef_round_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
   return _mm512_scalef_round_pd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_round_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
   return _mm512_mask_scalef_round_pd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_round_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
   return _mm512_maskz_scalef_round_pd(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_scalef_pd(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_scalef_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
-  return _mm512_scalef_pd(__A, __B); 
+  return _mm512_scalef_pd(__A, __B);
 }
 
+//
 __m512d test_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
-  return _mm512_mask_scalef_pd(__W, __U, __A, __B); 
+  return _mm512_mask_scalef_pd(__W, __U, __A, __B);
 }
 
+//
 __m512d test_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_pd
-  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
-  return _mm512_maskz_scalef_pd(__U, __A, __B); 
+  return _mm512_maskz_scalef_pd(__U, __A, __B);
 }
 
+//
 __m512 test_mm512_scalef_round_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_scalef_round_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
   return _mm512_scalef_round_ps(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_round_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
   return _mm512_mask_scalef_round_ps(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_round_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
   return _mm512_maskz_scalef_round_ps(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_scalef_ps(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_scalef_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
-  return _mm512_scalef_ps(__A, __B); 
+  return _mm512_scalef_ps(__A, __B);
 }
 
+//
 __m512 test_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
-  return _mm512_mask_scalef_ps(__W, __U, __A, __B); 
+  return _mm512_mask_scalef_ps(__W, __U, __A, __B);
 }
 
+//
 __m512 test_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_ps
-  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
-  return _mm512_maskz_scalef_ps(__U, __A, __B); 
+  return _mm512_maskz_scalef_ps(__U, __A, __B);
 }
 
+//
 __m128d test_mm_scalef_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_scalef_round_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %2, i8 -1, i32 11)
   return _mm_scalef_round_sd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_scalef_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_scalef_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef
-  return _mm_scalef_sd(__A, __B); 
+  return _mm_scalef_sd(__A, __B);
 }
 
+//
 __m128d test_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_scalef_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef.sd
   return _mm_mask_scalef_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_scalef_round_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 11)
     return _mm_mask_scalef_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_scalef_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef.sd
     return _mm_maskz_scalef_sd(__U, __A, __B);
 }
 
+//
 __m128d test_mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_scalef_round_sd
-  // CHECK: @llvm.x86.avx512.mask.scalef.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 11)
     return _mm_maskz_scalef_round_sd(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_scalef_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_scalef_round_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 -1, i32 11)
   return _mm_scalef_round_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_scalef_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_scalef_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss
-  return _mm_scalef_ss(__A, __B); 
+  return _mm_scalef_ss(__A, __B);
 }
 
+//
 __m128 test_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_scalef_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss
     return _mm_mask_scalef_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_scalef_round_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
     return _mm_mask_scalef_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_scalef_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss
     return _mm_maskz_scalef_ss(__U, __A, __B);
 }
 
+//
 __m128 test_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_scalef_round_ss
-  // CHECK: @llvm.x86.avx512.mask.scalef.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
     return _mm_maskz_scalef_round_ss(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_srai_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srai_epi32
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  return _mm512_srai_epi32(__A, 5); 
+  return _mm512_srai_epi32(__A, 5);
 }
 
+//
 __m512i test_mm512_srai_epi32_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srai_epi32_2
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  return _mm512_srai_epi32(__A, __B); 
+  return _mm512_srai_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi32
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srai_epi32(__W, __U, __A, 5); 
+  return _mm512_mask_srai_epi32(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_srai_epi32_2(__m512i __W, __mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi32_2
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srai_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_srai_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi32
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srai_epi32(__U, __A, 5); 
+  return _mm512_maskz_srai_epi32(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_srai_epi32_2(__mmask16 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi32_2
-  // CHECK: @llvm.x86.avx512.psrai.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srai_epi32(__U, __A, __B); 
+  return _mm512_maskz_srai_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srai_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_srai_epi64
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  return _mm512_srai_epi64(__A, 5); 
+  return _mm512_srai_epi64(__A, 5);
 }
 
+//
 __m512i test_mm512_srai_epi64_2(__m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_srai_epi64_2
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  return _mm512_srai_epi64(__A, __B); 
+  return _mm512_srai_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi64
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srai_epi64(__W, __U, __A, 5); 
+  return _mm512_mask_srai_epi64(__W, __U, __A, 5);
 }
 
+//
 __m512i test_mm512_mask_srai_epi64_2(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_mask_srai_epi64_2
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srai_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_srai_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi64
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srai_epi64(__U, __A, 5); 
+  return _mm512_maskz_srai_epi64(__U, __A, 5);
 }
 
+//
 __m512i test_mm512_maskz_srai_epi64_2(__mmask8 __U, __m512i __A, unsigned int __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srai_epi64_2
-  // CHECK: @llvm.x86.avx512.psrai.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srai_epi64(__U, __A, __B); 
+  return _mm512_maskz_srai_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sll_epi32
-  // CHECK: @llvm.x86.avx512.psll.d.512
-  return _mm512_sll_epi32(__A, __B); 
+  return _mm512_sll_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sll_epi32
-  // CHECK: @llvm.x86.avx512.psll.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_sll_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_sll_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sll_epi32
-  // CHECK: @llvm.x86.avx512.psll.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_sll_epi32(__U, __A, __B); 
+  return _mm512_maskz_sll_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sll_epi64
-  // CHECK: @llvm.x86.avx512.psll.q.512
-  return _mm512_sll_epi64(__A, __B); 
+  return _mm512_sll_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sll_epi64
-  // CHECK: @llvm.x86.avx512.psll.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_sll_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_sll_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sll_epi64
-  // CHECK: @llvm.x86.avx512.psll.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_sll_epi64(__U, __A, __B); 
+  return _mm512_maskz_sll_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_sllv_epi32(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_sllv_epi32
-  // CHECK: @llvm.x86.avx512.psllv.d.512
-  return _mm512_sllv_epi32(__X, __Y); 
+  return _mm512_sllv_epi32(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_sllv_epi32
-  // CHECK: @llvm.x86.avx512.psllv.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_sllv_epi32(__W, __U, __X, __Y); 
+  return _mm512_mask_sllv_epi32(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_sllv_epi32
-  // CHECK: @llvm.x86.avx512.psllv.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_sllv_epi32(__U, __X, __Y); 
+  return _mm512_maskz_sllv_epi32(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_sllv_epi64(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_sllv_epi64
-  // CHECK: @llvm.x86.avx512.psllv.q.512
-  return _mm512_sllv_epi64(__X, __Y); 
+  return _mm512_sllv_epi64(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_sllv_epi64
-  // CHECK: @llvm.x86.avx512.psllv.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_sllv_epi64(__W, __U, __X, __Y); 
+  return _mm512_mask_sllv_epi64(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_sllv_epi64
-  // CHECK: @llvm.x86.avx512.psllv.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_sllv_epi64(__U, __X, __Y); 
+  return _mm512_maskz_sllv_epi64(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sra_epi32
-  // CHECK: @llvm.x86.avx512.psra.d.512
-  return _mm512_sra_epi32(__A, __B); 
+  return _mm512_sra_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sra_epi32
-  // CHECK: @llvm.x86.avx512.psra.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_sra_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_sra_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sra_epi32
-  // CHECK: @llvm.x86.avx512.psra.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_sra_epi32(__U, __A, __B); 
+  return _mm512_maskz_sra_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_sra_epi64
-  // CHECK: @llvm.x86.avx512.psra.q.512
-  return _mm512_sra_epi64(__A, __B); 
+  return _mm512_sra_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_sra_epi64
-  // CHECK: @llvm.x86.avx512.psra.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_sra_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_sra_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sra_epi64
-  // CHECK: @llvm.x86.avx512.psra.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_sra_epi64(__U, __A, __B); 
+  return _mm512_maskz_sra_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srav_epi32(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_srav_epi32
-  // CHECK: @llvm.x86.avx512.psrav.d.512
-  return _mm512_srav_epi32(__X, __Y); 
+  return _mm512_srav_epi32(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_srav_epi32
-  // CHECK: @llvm.x86.avx512.psrav.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srav_epi32(__W, __U, __X, __Y); 
+  return _mm512_mask_srav_epi32(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_srav_epi32
-  // CHECK: @llvm.x86.avx512.psrav.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srav_epi32(__U, __X, __Y); 
+  return _mm512_maskz_srav_epi32(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_srav_epi64(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_srav_epi64
-  // CHECK: @llvm.x86.avx512.psrav.q.512
-  return _mm512_srav_epi64(__X, __Y); 
+  return _mm512_srav_epi64(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_srav_epi64
-  // CHECK: @llvm.x86.avx512.psrav.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srav_epi64(__W, __U, __X, __Y); 
+  return _mm512_mask_srav_epi64(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_srav_epi64
-  // CHECK: @llvm.x86.avx512.psrav.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srav_epi64(__U, __X, __Y); 
+  return _mm512_maskz_srav_epi64(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_srl_epi32
-  // CHECK: @llvm.x86.avx512.psrl.d.512
-  return _mm512_srl_epi32(__A, __B); 
+  return _mm512_srl_epi32(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_srl_epi32
-  // CHECK: @llvm.x86.avx512.psrl.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srl_epi32(__W, __U, __A, __B); 
+  return _mm512_mask_srl_epi32(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srl_epi32
-  // CHECK: @llvm.x86.avx512.psrl.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srl_epi32(__U, __A, __B); 
+  return _mm512_maskz_srl_epi32(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_srl_epi64
-  // CHECK: @llvm.x86.avx512.psrl.q.512
-  return _mm512_srl_epi64(__A, __B); 
+  return _mm512_srl_epi64(__A, __B);
 }
 
+//
 __m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_srl_epi64
-  // CHECK: @llvm.x86.avx512.psrl.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srl_epi64(__W, __U, __A, __B); 
+  return _mm512_mask_srl_epi64(__W, __U, __A, __B);
 }
 
+//
 __m512i test_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_srl_epi64
-  // CHECK: @llvm.x86.avx512.psrl.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srl_epi64(__U, __A, __B); 
+  return _mm512_maskz_srl_epi64(__U, __A, __B);
 }
 
+//
 __m512i test_mm512_srlv_epi32(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_srlv_epi32
-  // CHECK: @llvm.x86.avx512.psrlv.d.512
-  return _mm512_srlv_epi32(__X, __Y); 
+  return _mm512_srlv_epi32(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_srlv_epi32
-  // CHECK: @llvm.x86.avx512.psrlv.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_srlv_epi32(__W, __U, __X, __Y); 
+  return _mm512_mask_srlv_epi32(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_srlv_epi32
-  // CHECK: @llvm.x86.avx512.psrlv.d.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_srlv_epi32(__U, __X, __Y); 
+  return _mm512_maskz_srlv_epi32(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_srlv_epi64(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_srlv_epi64
-  // CHECK: @llvm.x86.avx512.psrlv.q.512
-  return _mm512_srlv_epi64(__X, __Y); 
+  return _mm512_srlv_epi64(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_srlv_epi64
-  // CHECK: @llvm.x86.avx512.psrlv.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_srlv_epi64(__W, __U, __X, __Y); 
+  return _mm512_mask_srlv_epi64(__W, __U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_srlv_epi64
-  // CHECK: @llvm.x86.avx512.psrlv.q.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_srlv_epi64(__U, __X, __Y); 
+  return _mm512_maskz_srlv_epi64(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_ternarylogic_epi32
-  // CHECK: @llvm.x86.avx512.pternlog.d.512({{.*}}, i32 240)
   return _mm512_ternarylogic_epi32(__A, __B, __C, _MM_TERNLOG_A);
 }
 
+//
 __m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi32
-  // CHECK: @llvm.x86.avx512.pternlog.d.512({{.*}}, i32 204)
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_ternarylogic_epi32(__A, __U, __B, __C, _MM_TERNLOG_B);
 }
 
+//
 __m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi32
-  // CHECK: @llvm.x86.avx512.pternlog.d.512({{.*}}, i32 170)
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> zeroinitializer
   return _mm512_maskz_ternarylogic_epi32(__U, __A, __B, __C, _MM_TERNLOG_C);
 }
 
+//
 __m512i test_mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_ternarylogic_epi64
-  // CHECK: @llvm.x86.avx512.pternlog.q.512({{.*}}, i32 192)
   return _mm512_ternarylogic_epi64(__A, __B, __C, _MM_TERNLOG_A & _MM_TERNLOG_B);
 }
 
+//
 __m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi64
-  // CHECK: @llvm.x86.avx512.pternlog.q.512({{.*}}, i32 238)
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_ternarylogic_epi64(__A, __U, __B, __C, _MM_TERNLOG_B | _MM_TERNLOG_C);
 }
 
+//
 __m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) {
-  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi64
-  // CHECK: @llvm.x86.avx512.pternlog.q.512({{.*}}, i32 111)
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> zeroinitializer
   return _mm512_maskz_ternarylogic_epi64(__U, __A, __B, __C, ~_MM_TERNLOG_A | (_MM_TERNLOG_B ^ _MM_TERNLOG_C));
 }
 
+//
 __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_shuffle_f32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  return _mm512_shuffle_f32x4(__A, __B, 4); 
+  return _mm512_shuffle_f32x4(__A, __B, 4);
 }
 
+//
 __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4); 
+  return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4);
 }
 
+//
 __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4); 
+  return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4);
 }
 
+//
 __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_shuffle_f64x2
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  return _mm512_shuffle_f64x2(__A, __B, 4); 
+  return _mm512_shuffle_f64x2(__A, __B, 4);
 }
 
+//
 __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4); 
+  return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4);
 }
 
+//
 __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4); 
+  return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4);
 }
 
+//
 __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_shuffle_i32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  return _mm512_shuffle_i32x4(__A, __B, 4); 
+  return _mm512_shuffle_i32x4(__A, __B, 4);
 }
 
+//
 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); 
+  return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4);
 }
 
+//
 __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); 
+  return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4);
 }
 
+//
 __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_shuffle_i64x2
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  return _mm512_shuffle_i64x2(__A, __B, 4); 
+  return _mm512_shuffle_i64x2(__A, __B, 4);
 }
 
+//
 __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4); 
+  return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4);
 }
 
+//
 __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4); 
+  return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4);
 }
 
+//
 __m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
-  // CHECK-LABEL: @test_mm512_shuffle_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
-  return _mm512_shuffle_pd(__M, __V, 4); 
+  return _mm512_shuffle_pd(__M, __V, 4);
 }
 
+//
 __m512d test_mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_shuffle_pd(__W, __U, __M, __V, 4); 
+  return _mm512_mask_shuffle_pd(__W, __U, __M, __V, 4);
 }
 
+//
 __m512d test_mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_shuffle_pd(__U, __M, __V, 4); 
+  return _mm512_maskz_shuffle_pd(__U, __M, __V, 4);
 }
 
+//
 __m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) {
-  // CHECK-LABEL: @test_mm512_shuffle_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
-  return _mm512_shuffle_ps(__M, __V, 4); 
+  return _mm512_shuffle_ps(__M, __V, 4);
 }
 
+//
 __m512 test_mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, __m512 __V) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_shuffle_ps(__W, __U, __M, __V, 4); 
+  return _mm512_mask_shuffle_ps(__W, __U, __M, __V, 4);
 }
 
+//
 __m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4); 
+  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4);
 }
 
+//
 __m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_sqrt_round_sd
-  // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 11)
   return _mm_sqrt_round_sd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_sqrt_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: call double @llvm.sqrt.f64(double %{{.*}})
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 {{.*}}, double {{.*}}, double {{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double {{.*}}, i64 0
   return _mm_mask_sqrt_sd(__W,__U,__A,__B);
 }
 
+//
 __m128d test_mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_sqrt_round_sd
-  // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 11)
   return _mm_mask_sqrt_round_sd(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_sqrt_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: call double @llvm.sqrt.f64(double %{{.*}})
-  // CHECK-NEXT: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 {{.*}}, double {{.*}}, double {{.*}}
-  // CHECK-NEXT: insertelement <2 x double> %{{.*}}, double {{.*}}, i64 0
   return _mm_maskz_sqrt_sd(__U,__A,__B);
 }
 
+//
 __m128d test_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_sqrt_round_sd
-  // CHECK: call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 %{{.*}}, i32 11)
   return _mm_maskz_sqrt_round_sd(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_sqrt_round_ss(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_sqrt_round_ss
-  // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 -1, i32 11)
   return _mm_sqrt_round_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_sqrt_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: call float @llvm.sqrt.f32(float %{{.*}})
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 {{.*}}, float {{.*}}, float {{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float {{.*}}, i64 0
   return _mm_mask_sqrt_ss(__W,__U,__A,__B);
 }
 
+//
 __m128 test_mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_sqrt_round_ss
-  // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 {{.*}}, i32 11)
   return _mm_mask_sqrt_round_ss(__W,__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_sqrt_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: call float @llvm.sqrt.f32(float %{{.*}})
-  // CHECK-NEXT: extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: select i1 {{.*}}, float {{.*}}, float {{.*}}
-  // CHECK-NEXT: insertelement <4 x float> %{{.*}}, float {{.*}}, i64 0
   return _mm_maskz_sqrt_ss(__U,__A,__B);
 }
 
+//
 __m128 test_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_sqrt_round_ss
-  // CHECK: call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 {{.*}}, i32 11)
   return _mm_maskz_sqrt_round_ss(__U,__A,__B,_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_broadcast_f32x4(float const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_f32x4
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  return _mm512_broadcast_f32x4(_mm_loadu_ps(__A)); 
+  return _mm512_broadcast_f32x4(_mm_loadu_ps(__A));
 }
 
+//
 __m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, float const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x4
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A)); 
+  return _mm512_mask_broadcast_f32x4(__O, __M, _mm_loadu_ps(__A));
 }
 
+//
 __m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, float const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x4
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A)); 
+  return _mm512_maskz_broadcast_f32x4(__M, _mm_loadu_ps(__A));
 }
 
+//
 __m512d test_mm512_broadcast_f64x4(double const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_f64x4
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  return _mm512_broadcast_f64x4(_mm256_loadu_pd(__A)); 
+  return _mm512_broadcast_f64x4(_mm256_loadu_pd(__A));
 }
 
+//
 __m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x4
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_broadcast_f64x4(__O, __M, _mm256_loadu_pd(__A)); 
+  return _mm512_mask_broadcast_f64x4(__O, __M, _mm256_loadu_pd(__A));
 }
 
+//
 __m512d test_mm512_maskz_broadcast_f64x4(__mmask8 __M, double const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x4
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_broadcast_f64x4(__M, _mm256_loadu_pd(__A)); 
+  return _mm512_maskz_broadcast_f64x4(__M, _mm256_loadu_pd(__A));
 }
 
+//
 __m512i test_mm512_broadcast_i32x4(__m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_i32x4
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  return _mm512_broadcast_i32x4(_mm_loadu_si128(__A)); 
+  return _mm512_broadcast_i32x4(_mm_loadu_si128(__A));
 }
 
+//
 __m512i test_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x4
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_broadcast_i32x4(__O, __M, _mm_loadu_si128(__A)); 
+  return _mm512_mask_broadcast_i32x4(__O, __M, _mm_loadu_si128(__A));
 }
 
+//
 __m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x4
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A)); 
+  return _mm512_maskz_broadcast_i32x4(__M, _mm_loadu_si128(__A));
 }
 
+//
 __m512i test_mm512_broadcast_i64x4(__m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_broadcast_i64x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  return _mm512_broadcast_i64x4(_mm256_loadu_si256(__A)); 
+  return _mm512_broadcast_i64x4(_mm256_loadu_si256(__A));
 }
 
+//
 __m512i test_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_broadcast_i64x4(__O, __M, _mm256_loadu_si256(__A)); 
+  return _mm512_mask_broadcast_i64x4(__O, __M, _mm256_loadu_si256(__A));
 }
 
+//
 __m512i test_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i const* __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_broadcast_i64x4(__M, _mm256_loadu_si256(__A)); 
+  return _mm512_maskz_broadcast_i64x4(__M, _mm256_loadu_si256(__A));
 }
 
+//
 __m512d test_mm512_broadcastsd_pd(__m128d __A) {
-  // CHECK-LABEL: @test_mm512_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
   return _mm512_broadcastsd_pd(__A);
 }
 
+//
 __m512d test_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_broadcastsd_pd(__O, __M, __A);
 }
 
+//
 __m512d test_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> zeroinitializer
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_broadcastsd_pd(__M, __A);
 }
 
+//
 __m512 test_mm512_broadcastss_ps(__m128 __A) {
-  // CHECK-LABEL: @test_mm512_broadcastss_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
   return _mm512_broadcastss_ps(__A);
 }
 
+//
 __m512 test_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastss_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_broadcastss_ps(__O, __M, __A);
 }
 
+//
 __m512 test_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastss_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> zeroinitializer
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_broadcastss_ps(__M, __A);
 }
 
+//
 __m512i test_mm512_broadcastd_epi32(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_broadcastd_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
   return _mm512_broadcastd_epi32(__A);
 }
 
+//
 __m512i test_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastd_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_broadcastd_epi32(__O, __M, __A);
 }
 
+//
 __m512i test_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastd_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> zeroinitializer
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_broadcastd_epi32(__M, __A);
 }
 
+//
 __m512i test_mm512_broadcastq_epi64(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_broadcastq_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
   return _mm512_broadcastq_epi64(__A);
 }
 
+//
 __m512i test_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_mask_broadcastq_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_broadcastq_epi64(__O, __M, __A);
 }
 
+//
 __m512i test_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_broadcastq_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> zeroinitializer
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_broadcastq_epi64(__M, __A);
 }
 
+//
 __m128i test_mm512_cvtsepi32_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
-  return _mm512_cvtsepi32_epi8(__A); 
+  return _mm512_cvtsepi32_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
-  return _mm512_mask_cvtsepi32_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtsepi32_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
-  return _mm512_maskz_cvtsepi32_epi8(__M, __A); 
+  return _mm512_maskz_cvtsepi32_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtsepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.512
-  return _mm512_mask_cvtsepi32_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtsepi32_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtsepi32_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
-  return _mm512_cvtsepi32_epi16(__A); 
+  return _mm512_cvtsepi32_epi16(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
-  return _mm512_mask_cvtsepi32_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtsepi32_epi16(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
-  return _mm512_maskz_cvtsepi32_epi16(__M, __A); 
+  return _mm512_maskz_cvtsepi32_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.512
-  return _mm512_mask_cvtsepi32_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtsepi32_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtsepi64_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
-  return _mm512_cvtsepi64_epi8(__A); 
+  return _mm512_cvtsepi64_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
-  return _mm512_mask_cvtsepi64_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtsepi64_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
-  return _mm512_maskz_cvtsepi64_epi8(__M, __A); 
+  return _mm512_maskz_cvtsepi64_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.512
-  return _mm512_mask_cvtsepi64_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtsepi64_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtsepi64_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
-  return _mm512_cvtsepi64_epi32(__A); 
+  return _mm512_cvtsepi64_epi32(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
-  return _mm512_mask_cvtsepi64_epi32(__O, __M, __A); 
+  return _mm512_mask_cvtsepi64_epi32(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
-  return _mm512_maskz_cvtsepi64_epi32(__M, __A); 
+  return _mm512_maskz_cvtsepi64_epi32(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.512
-  return _mm512_mask_cvtsepi64_storeu_epi32(__P, __M, __A); 
+  return _mm512_mask_cvtsepi64_storeu_epi32(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtsepi64_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtsepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
-  return _mm512_cvtsepi64_epi16(__A); 
+  return _mm512_cvtsepi64_epi16(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
-  return _mm512_mask_cvtsepi64_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtsepi64_epi16(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
-  return _mm512_maskz_cvtsepi64_epi16(__M, __A); 
+  return _mm512_maskz_cvtsepi64_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.512
-  return _mm512_mask_cvtsepi64_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtsepi64_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtusepi32_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
-  return _mm512_cvtusepi32_epi8(__A); 
+  return _mm512_cvtusepi32_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
-  return _mm512_mask_cvtusepi32_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtusepi32_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
-  return _mm512_maskz_cvtusepi32_epi8(__M, __A); 
+  return _mm512_maskz_cvtusepi32_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtusepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.512
-  return _mm512_mask_cvtusepi32_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtusepi32_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtusepi32_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
-  return _mm512_cvtusepi32_epi16(__A); 
+  return _mm512_cvtusepi32_epi16(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
-  return _mm512_mask_cvtusepi32_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtusepi32_epi16(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
-  return _mm512_maskz_cvtusepi32_epi16(__M, __A); 
+  return _mm512_maskz_cvtusepi32_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.512
-  return _mm512_mask_cvtusepi32_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtusepi32_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtusepi64_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
-  return _mm512_cvtusepi64_epi8(__A); 
+  return _mm512_cvtusepi64_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
-  return _mm512_mask_cvtusepi64_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtusepi64_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
-  return _mm512_maskz_cvtusepi64_epi8(__M, __A); 
+  return _mm512_maskz_cvtusepi64_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.512
-  return _mm512_mask_cvtusepi64_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtusepi64_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtusepi64_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
-  return _mm512_cvtusepi64_epi32(__A); 
+  return _mm512_cvtusepi64_epi32(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
-  return _mm512_mask_cvtusepi64_epi32(__O, __M, __A); 
+  return _mm512_mask_cvtusepi64_epi32(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
-  return _mm512_maskz_cvtusepi64_epi32(__M, __A); 
+  return _mm512_maskz_cvtusepi64_epi32(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtusepi64_storeu_epi32(void* __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.512
-  return _mm512_mask_cvtusepi64_storeu_epi32(__P, __M, __A); 
+  return _mm512_mask_cvtusepi64_storeu_epi32(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtusepi64_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtusepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
-  return _mm512_cvtusepi64_epi16(__A); 
+  return _mm512_cvtusepi64_epi16(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
-  return _mm512_mask_cvtusepi64_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtusepi64_epi16(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
-  return _mm512_maskz_cvtusepi64_epi16(__M, __A); 
+  return _mm512_maskz_cvtusepi64_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.512
-  return _mm512_mask_cvtusepi64_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtusepi64_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtepi32_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi32_epi8
-  // CHECK: trunc <16 x i32> %{{.*}} to <16 x i8>
-  return _mm512_cvtepi32_epi8(__A); 
+  return _mm512_cvtepi32_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
-  return _mm512_mask_cvtepi32_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtepi32_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
-  return _mm512_maskz_cvtepi32_epi8(__M, __A); 
+  return _mm512_maskz_cvtepi32_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.512
-  return _mm512_mask_cvtepi32_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtepi32_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtepi32_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi32_epi16
-  // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
-  return _mm512_cvtepi32_epi16(__A); 
+  return _mm512_cvtepi32_epi16(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
-  return _mm512_mask_cvtepi32_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtepi32_epi16(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
-  return _mm512_maskz_cvtepi32_epi16(__M, __A); 
+  return _mm512_maskz_cvtepi32_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtepi32_storeu_epi16(void * __P, __mmask16 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.512
-  return _mm512_mask_cvtepi32_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtepi32_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtepi64_epi8(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
-  return _mm512_cvtepi64_epi8(__A); 
+  return _mm512_cvtepi64_epi8(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
-  return _mm512_mask_cvtepi64_epi8(__O, __M, __A); 
+  return _mm512_mask_cvtepi64_epi8(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
-  return _mm512_maskz_cvtepi64_epi8(__M, __A); 
+  return _mm512_maskz_cvtepi64_epi8(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi8
-  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.512
-  return _mm512_mask_cvtepi64_storeu_epi8(__P, __M, __A); 
+  return _mm512_mask_cvtepi64_storeu_epi8(__P, __M, __A);
 }
 
+//
 __m256i test_mm512_cvtepi64_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi64_epi32
-  // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
-  return _mm512_cvtepi64_epi32(__A); 
+  return _mm512_cvtepi64_epi32(__A);
 }
 
+//
 __m256i test_mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi32
-  // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm512_mask_cvtepi64_epi32(__O, __M, __A); 
+  return _mm512_mask_cvtepi64_epi32(__O, __M, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi32
-  // CHECK: trunc <8 x i64> %{{.*}} to <8 x i32>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm512_maskz_cvtepi64_epi32(__M, __A); 
+  return _mm512_maskz_cvtepi64_epi32(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtepi64_storeu_epi32(void* __P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi32
-  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.512
-  return _mm512_mask_cvtepi64_storeu_epi32(__P, __M, __A); 
+  return _mm512_mask_cvtepi64_storeu_epi32(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_cvtepi64_epi16(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_cvtepi64_epi16
-  // CHECK: trunc <8 x i64> %{{.*}} to <8 x i16>
-  return _mm512_cvtepi64_epi16(__A); 
+  return _mm512_cvtepi64_epi16(__A);
 }
 
+//
 __m128i test_mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
-  return _mm512_mask_cvtepi64_epi16(__O, __M, __A); 
+  return _mm512_mask_cvtepi64_epi16(__O, __M, __A);
 }
 
+//
 __m128i test_mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
-  return _mm512_maskz_cvtepi64_epi16(__M, __A); 
+  return _mm512_maskz_cvtepi64_epi16(__M, __A);
 }
 
+//
 void test_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi16
-  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.512
-  return _mm512_mask_cvtepi64_storeu_epi16(__P, __M, __A); 
+  return _mm512_mask_cvtepi64_storeu_epi16(__P, __M, __A);
 }
 
+//
 __m128i test_mm512_extracti32x4_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  return _mm512_extracti32x4_epi32(__A, 3); 
+  return _mm512_extracti32x4_epi32(__A, 3);
 }
 
+//
 __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); 
+  return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3);
 }
 
+//
 __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); 
+  return _mm512_maskz_extracti32x4_epi32(__U, __A, 3);
 }
 
+//
 __m256i test_mm512_extracti64x4_epi64(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  return _mm512_extracti64x4_epi64(__A, 1); 
+  return _mm512_extracti64x4_epi64(__A, 1);
 }
 
+//
 __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
-  return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); 
+  return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1);
 }
 
+//
 __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
-  return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); 
+  return _mm512_maskz_extracti64x4_epi64(__U, __A, 1);
 }
 
+//
 __m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm512_insertf64x4
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
   return _mm512_insertf64x4(__A, __B, 1);
 }
 
+//
 __m512d test_mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm512_mask_insertf64x4
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_insertf64x4(__W, __U, __A, __B, 1); 
+  return _mm512_mask_insertf64x4(__W, __U, __A, __B, 1);
 }
 
+//
 __m512d test_mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B) {
-  // CHECK-LABEL: @test_mm512_maskz_insertf64x4
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_insertf64x4(__U, __A, __B, 1); 
+  return _mm512_maskz_insertf64x4(__U, __A, __B, 1);
 }
 
+//
 __m512i test_mm512_inserti64x4(__m512i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_inserti64x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  return _mm512_inserti64x4(__A, __B, 1); 
+  return _mm512_inserti64x4(__A, __B, 1);
 }
 
+//
 __m512i test_mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_mask_inserti64x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_inserti64x4(__W, __U, __A, __B, 1); 
+  return _mm512_mask_inserti64x4(__W, __U, __A, __B, 1);
 }
 
+//
 __m512i test_mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_inserti64x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_inserti64x4(__U, __A, __B, 1); 
+  return _mm512_maskz_inserti64x4(__U, __A, __B, 1);
 }
 
+//
 __m512 test_mm512_insertf32x4(__m512 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_insertf32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_insertf32x4(__A, __B, 1);
 }
 
+//
 __m512 test_mm512_mask_insertf32x4(__m512 __W, __mmask16 __U, __m512 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_mask_insertf32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_insertf32x4(__W, __U, __A, __B, 1); 
+  return _mm512_mask_insertf32x4(__W, __U, __A, __B, 1);
 }
 
+//
 __m512 test_mm512_maskz_insertf32x4(__mmask16 __U, __m512 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm512_maskz_insertf32x4
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_insertf32x4(__U, __A, __B, 1); 
+  return _mm512_maskz_insertf32x4(__U, __A, __B, 1);
 }
 
+//
 __m512i test_mm512_inserti32x4(__m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_inserti32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  return _mm512_inserti32x4(__A, __B, 1); 
+  return _mm512_inserti32x4(__A, __B, 1);
 }
 
+//
 __m512i test_mm512_mask_inserti32x4(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_mask_inserti32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_inserti32x4(__W, __U, __A, __B, 1); 
+  return _mm512_mask_inserti32x4(__W, __U, __A, __B, 1);
 }
 
+//
 __m512i test_mm512_maskz_inserti32x4(__mmask16 __U, __m512i __A, __m128i __B) {
-  // CHECK-LABEL: @test_mm512_maskz_inserti32x4
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_inserti32x4(__U, __A, __B, 1); 
+  return _mm512_maskz_inserti32x4(__U, __A, __B, 1);
 }
 
+//
 __m512d test_mm512_getmant_round_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_getmant_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
   return _mm512_getmant_round_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
   return _mm512_mask_getmant_round_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
   return _mm512_maskz_getmant_round_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_getmant_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_getmant_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
-  return _mm512_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512d test_mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
-  return _mm512_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512d test_mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_pd
-  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
-  return _mm512_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512 test_mm512_getmant_round_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_getmant_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
   return _mm512_getmant_round_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
   return _mm512_mask_getmant_round_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
   return _mm512_maskz_getmant_round_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_getmant_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_getmant_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
-  return _mm512_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512 test_mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
-  return _mm512_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512 test_mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_ps
-  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
-  return _mm512_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+  return _mm512_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m512d test_mm512_getexp_round_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_getexp_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
   return _mm512_getexp_round_pd(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
   return _mm512_mask_getexp_round_pd(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_round_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
   return _mm512_maskz_getexp_round_pd(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_getexp_pd(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_getexp_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
-  return _mm512_getexp_pd(__A); 
+  return _mm512_getexp_pd(__A);
 }
 
+//
 __m512d test_mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
-  return _mm512_mask_getexp_pd(__W, __U, __A); 
+  return _mm512_mask_getexp_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_pd
-  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
-  return _mm512_maskz_getexp_pd(__U, __A); 
+  return _mm512_maskz_getexp_pd(__U, __A);
 }
 
+//
 __m512 test_mm512_getexp_round_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_getexp_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
   return _mm512_getexp_round_ps(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
   return _mm512_mask_getexp_round_ps(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_round_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
   return _mm512_maskz_getexp_round_ps(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_getexp_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_getexp_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
-  return _mm512_getexp_ps(__A); 
+  return _mm512_getexp_ps(__A);
 }
 
+//
 __m512 test_mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
-  return _mm512_mask_getexp_ps(__W, __U, __A); 
+  return _mm512_mask_getexp_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_ps
-  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
-  return _mm512_maskz_getexp_ps(__U, __A); 
+  return _mm512_maskz_getexp_ps(__U, __A);
 }
 
+//
 __m256 test_mm512_i64gather_ps(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i64gather_ps
-  // CHECK: @llvm.x86.avx512.mask.gather.qps.512
-  return _mm512_i64gather_ps(__index, __addr, 2); 
+  return _mm512_i64gather_ps(__index, __addr, 2);
 }
 
+//
 __m256 test_mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i64gather_ps
-  // CHECK: @llvm.x86.avx512.mask.gather.qps.512
-  return _mm512_mask_i64gather_ps(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i64gather_ps(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m256i test_mm512_i64gather_epi32(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i64gather_epi32
-  // CHECK: @llvm.x86.avx512.mask.gather.qpi.512
-  return _mm512_i64gather_epi32(__index, __addr, 2); 
+  return _mm512_i64gather_epi32(__index, __addr, 2);
 }
 
+//
 __m256i test_mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i64gather_epi32
-  // CHECK: @llvm.x86.avx512.mask.gather.qpi.512
-  return _mm512_mask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512d test_mm512_i64gather_pd(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i64gather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.qpd.512
-  return _mm512_i64gather_pd(__index, __addr, 2); 
+  return _mm512_i64gather_pd(__index, __addr, 2);
 }
 
+//
 __m512d test_mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i64gather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.qpd.512
-  return _mm512_mask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i64gather_pd(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512i test_mm512_i64gather_epi64(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i64gather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.qpq.512
-  return _mm512_i64gather_epi64(__index, __addr, 2); 
+  return _mm512_i64gather_epi64(__index, __addr, 2);
 }
 
+//
 __m512i test_mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i64gather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.qpq.512
-  return _mm512_mask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512 test_mm512_i32gather_ps(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32gather_ps
-  // CHECK: @llvm.x86.avx512.mask.gather.dps.512
-  return _mm512_i32gather_ps(__index, __addr, 2); 
+  return _mm512_i32gather_ps(__index, __addr, 2);
 }
 
+//
 __m512 test_mm512_mask_i32gather_ps(__m512 v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32gather_ps
-  // CHECK: @llvm.x86.avx512.mask.gather.dps.512
-  return _mm512_mask_i32gather_ps(v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i32gather_ps(v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512i test_mm512_i32gather_epi32(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32gather_epi32
-  // CHECK: @llvm.x86.avx512.mask.gather.dpi.512
-  return _mm512_i32gather_epi32(__index, __addr, 2); 
+  return _mm512_i32gather_epi32(__index, __addr, 2);
 }
 
+//
 __m512i test_mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32gather_epi32
-  // CHECK: @llvm.x86.avx512.mask.gather.dpi.512
-  return _mm512_mask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512d test_mm512_i32gather_pd(__m256i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32gather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.dpd.512
-  return _mm512_i32gather_pd(__index, __addr, 2); 
+  return _mm512_i32gather_pd(__index, __addr, 2);
 }
 
+//
 __m512d test_mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32gather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.dpd.512
-  return _mm512_mask_i32gather_pd(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i32gather_pd(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 __m512i test_mm512_i32gather_epi64(__m256i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32gather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.dpq.512
-  return _mm512_i32gather_epi64(__index, __addr, 2); 
+  return _mm512_i32gather_epi64(__index, __addr, 2);
 }
 
+//
 __m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32gather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.dpq.512
-  return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+  return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 void test_mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1) {
-  // CHECK-LABEL: @test_mm512_i64scatter_ps
-  // CHECK: @llvm.x86.avx512.mask.scatter.qps.512
-  return _mm512_i64scatter_ps(__addr, __index, __v1, 2); 
+  return _mm512_i64scatter_ps(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m512i __index, __m256 __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i64scatter_ps
-  // CHECK: @llvm.x86.avx512.mask.scatter.qps.512
-  return _mm512_mask_i64scatter_ps(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i64scatter_ps(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i64scatter_epi32(void *__addr, __m512i __index, __m256i __v1) {
-  // CHECK-LABEL: @test_mm512_i64scatter_epi32
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpi.512
-  return _mm512_i64scatter_epi32(__addr, __index, __v1, 2); 
+  return _mm512_i64scatter_epi32(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __index, __m256i __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi32
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpi.512
-  return _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_i64scatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpd.512
-  return _mm512_i64scatter_pd(__addr, __index, __v1, 2); 
+  return _mm512_i64scatter_pd(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m512i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i64scatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpd.512
-  return _mm512_mask_i64scatter_pd(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i64scatter_pd(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i64scatter_epi64(void *__addr, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_i64scatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpq.512
-  return _mm512_i64scatter_epi64(__addr, __index, __v1, 2); 
+  return _mm512_i64scatter_epi64(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.qpq.512
-  return _mm512_mask_i64scatter_epi64(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i64scatter_epi64(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1) {
-  // CHECK-LABEL: @test_mm512_i32scatter_ps
-  // CHECK: @llvm.x86.avx512.mask.scatter.dps.512
-  return _mm512_i32scatter_ps(__addr, __index, __v1, 2); 
+  return _mm512_i32scatter_ps(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, __m512i __index, __m512 __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32scatter_ps
-  // CHECK: @llvm.x86.avx512.mask.scatter.dps.512
-  return _mm512_mask_i32scatter_ps(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i32scatter_ps(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i32scatter_epi32(void *__addr, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_i32scatter_epi32
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpi.512
-  return _mm512_i32scatter_epi32(__addr, __index, __v1, 2); 
+  return _mm512_i32scatter_epi32(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi32
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpi.512
-  return _mm512_mask_i32scatter_epi32(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i32scatter_epi32(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_i32scatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpd.512
-  return _mm512_i32scatter_pd(__addr, __index, __v1, 2); 
+  return _mm512_i32scatter_pd(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m256i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32scatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpd.512
-  return _mm512_mask_i32scatter_pd(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i32scatter_pd(__addr, __mask, __index, __v1, 2);
 }
 
+//
 void test_mm512_i32scatter_epi64(void *__addr, __m256i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_i32scatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
-  return _mm512_i32scatter_epi64(__addr, __index, __v1, 2); 
+  return _mm512_i32scatter_epi64(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
-  return _mm512_mask_i32scatter_epi64(__addr, __mask, __index, __v1, 2); 
+  return _mm512_mask_i32scatter_epi64(__addr, __mask, __index, __v1, 2);
 }
 
+//
 __m128d test_mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_rsqrt14_sd
-  // CHECK: @llvm.x86.avx512.rsqrt14.sd
   return _mm_mask_rsqrt14_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_rsqrt14_sd
-  // CHECK: @llvm.x86.avx512.rsqrt14.sd
   return _mm_maskz_rsqrt14_sd(__U, __A, __B);
 }
 
+//
 __m128 test_mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_rsqrt14_ss
-  // CHECK: @llvm.x86.avx512.rsqrt14.ss
   return _mm_mask_rsqrt14_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_rsqrt14_ss
-  // CHECK: @llvm.x86.avx512.rsqrt14.ss
   return _mm_maskz_rsqrt14_ss(__U, __A, __B);
 }
 
+//
 __m512d test_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_rcp14_pd 
-  // CHECK: @llvm.x86.avx512.rcp14.pd.512
   return _mm512_mask_rcp14_pd (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_rcp14_pd 
-  // CHECK: @llvm.x86.avx512.rcp14.pd.512
   return _mm512_maskz_rcp14_pd (__U,__A);
 }
 
+//
 __m512 test_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_rcp14_ps 
-  // CHECK: @llvm.x86.avx512.rcp14.ps.512
   return _mm512_mask_rcp14_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_rcp14_ps 
-  // CHECK: @llvm.x86.avx512.rcp14.ps.512
   return _mm512_maskz_rcp14_ps (__U,__A);
 }
 
+//
 __m128d test_mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_rcp14_sd
-  // CHECK: @llvm.x86.avx512.rcp14.sd
   return _mm_mask_rcp14_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_rcp14_sd
-  // CHECK: @llvm.x86.avx512.rcp14.sd
   return _mm_maskz_rcp14_sd(__U, __A, __B);
 }
 
+//
 __m128 test_mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_rcp14_ss
-  // CHECK: @llvm.x86.avx512.rcp14.ss
   return _mm_mask_rcp14_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_rcp14_ss
-  // CHECK: @llvm.x86.avx512.rcp14.ss
   return _mm_maskz_rcp14_ss(__U, __A, __B);
 }
 
+//
 __m128d test_mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_getexp_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
   return _mm_mask_getexp_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_getexp_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
   return _mm_mask_getexp_round_sd(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_getexp_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
   return _mm_maskz_getexp_sd(__U, __A, __B);
 }
 
+//
 __m128d test_mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_getexp_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getexp.sd
   return _mm_maskz_getexp_round_sd(__U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_getexp_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
   return _mm_mask_getexp_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_getexp_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
   return _mm_mask_getexp_round_ss(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_getexp_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
   return _mm_maskz_getexp_ss(__U, __A, __B);
 }
 
+//
 __m128 test_mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_getexp_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getexp.ss
   return _mm_maskz_getexp_round_ss(__U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_getmant_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
   return _mm_mask_getmant_sd(__W, __U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m128d test_mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_getmant_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
   return _mm_mask_getmant_round_sd(__W, __U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_getmant_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
   return _mm_maskz_getmant_sd(__U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m128d test_mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_maskz_getmant_round_sd
-  // CHECK: @llvm.x86.avx512.mask.getmant.sd
   return _mm_maskz_getmant_round_sd(__U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_getmant_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
   return _mm_mask_getmant_ss(__W, __U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m128 test_mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_getmant_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
   return _mm_mask_getmant_round_ss(__W, __U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_getmant_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
   return _mm_maskz_getmant_ss(__U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+//
 __m128 test_mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_maskz_getmant_round_ss
-  // CHECK: @llvm.x86.avx512.mask.getmant.ss
   return _mm_maskz_getmant_round_ss(__U, __A, __B, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fmadd_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fmadd_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_fmadd_round_ss(__m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_fmadd_round_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
   return _mm_fmadd_round_ss(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fmadd_round_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fmadd_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fmadd_ss(__U, __A, __B, __C);
 }
 
+//
 __m128 test_mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fmadd_round_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmadd_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fmadd_ss(__W, __X, __Y, __U);
 }
 
+//
 __m128 test_mm_mask3_fmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmadd_round_ss
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fmsub_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_fmsub_round_ss(__m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_fmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
   return _mm_fmsub_round_ss(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fmsub_ss(__U, __A, __B, __C);
 }
 
+//
 __m128 test_mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fmsub_ss(__W, __X, __Y, __U);
 }
 
+//
 __m128 test_mm_mask3_fmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fnmadd_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fnmadd_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_fnmadd_round_ss(__m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_fnmadd_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
   return _mm_fnmadd_round_ss(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fnmadd_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fnmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fnmadd_ss(__U, __A, __B, __C);
 }
 
+//
 __m128 test_mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fnmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fnmadd_ss(__W, __X, __Y, __U);
 }
 
+//
 __m128 test_mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fnmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fnmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fnmsub_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_fnmsub_round_ss(__m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_fnmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[FMA]], i64 0
   return _mm_fnmsub_round_ss(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
-  // CHECK-LABEL: @test_mm_mask_fnmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[A]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_mask_fnmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fnmsub_ss(__U, __A, __B, __C);
 }
 
+//
 __m128 test_mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <4 x float> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float 0.000000e+00
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGA]], float [[SEL]], i64 0
   return _mm_maskz_fnmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U);
 }
 
+//
 __m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss
-  // CHECK: [[NEG:%.+]] = fneg <4 x float> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <4 x float> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <4 x float> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <4 x float> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call float @llvm.x86.avx512.vfmadd.f32(float [[A]], float [[B]], float [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <4 x float> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, float [[FMA]], float [[C2]]
-  // CHECK-NEXT: insertelement <4 x float> [[ORIGC]], float [[SEL]], i64 0
   return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fmadd_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fmadd_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_fmadd_round_sd(__m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_fmadd_round_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
   return _mm_fmadd_round_sd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fmadd_round_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fmadd_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fmadd_sd(__U, __A, __B, __C);
 }
 
+//
 __m128d test_mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fmadd_round_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmadd_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fmadd_sd(__W, __X, __Y, __U);
 }
 
+//
 __m128d test_mm_mask3_fmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sd
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fmsub_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_fmsub_round_sd(__m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_fmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
   return _mm_fmsub_round_sd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fmsub_sd(__U, __A, __B, __C);
 }
 
+//
 __m128d test_mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fmsub_sd(__W, __X, __Y, __U);
 }
 
+//
 __m128d test_mm_mask3_fmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fnmadd_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fnmadd_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_fnmadd_round_sd(__m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_fnmadd_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
   return _mm_fnmadd_round_sd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fnmadd_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fnmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fnmadd_sd(__U, __A, __B, __C);
 }
 
+//
 __m128d test_mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fnmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fnmadd_sd(__W, __X, __Y, __U);
 }
 
+//
 __m128d test_mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fnmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fnmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fnmsub_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_fnmsub_round_sd(__m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_fnmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[FMA]], i64 0
   return _mm_fnmsub_round_sd(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
-  // CHECK-LABEL: @test_mm_mask_fnmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[A]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_mask_fnmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fnmsub_sd(__U, __A, __B, __C);
 }
 
+//
 __m128d test_mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[A:%.+]] = extractelement <2 x double> [[ORIGA:%.]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double 0.000000e+00
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGA]], double [[SEL]], i64 0
   return _mm_maskz_fnmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U);
 }
 
+//
 __m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd
-  // CHECK: [[NEG:%.+]] = fneg <2 x double> %{{.*}}
-  // CHECK: [[NEG2:%.+]] = fneg <2 x double> [[ORIGC:%.+]]
-  // CHECK: [[A:%.+]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <2 x double> [[NEG]], i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <2 x double> [[NEG2]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call double @llvm.x86.avx512.vfmadd.f64(double [[A]], double [[B]], double [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <2 x double> [[ORIGC]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, double [[FMA]], double [[C2]]
-  // CHECK-NEXT: insertelement <2 x double> [[ORIGC]], double [[SEL]], i64 0
   return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_permutex_pd(__m512d __X) {
-  // CHECK-LABEL: @test_mm512_permutex_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   return _mm512_permutex_pd(__X, 0);
 }
 
+//
 __m512d test_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X) {
-  // CHECK-LABEL: @test_mm512_mask_permutex_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_permutex_pd(__W, __U, __X, 0);
 }
 
+//
 __m512d test_mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex_pd
-  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_permutex_pd(__U, __X, 0);
 }
 
+//
 __m512i test_mm512_permutex_epi64(__m512i __X) {
-  // CHECK-LABEL: @test_mm512_permutex_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   return _mm512_permutex_epi64(__X, 0);
 }
 
+//
 __m512i test_mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X) {
-  // CHECK-LABEL: @test_mm512_mask_permutex_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_permutex_epi64(__W, __M, __X, 0);
 }
 
+//
 __m512i test_mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X) {
-  // CHECK-LABEL: @test_mm512_maskz_permutex_epi64
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_permutex_epi64(__M, __X, 0);
 }
 
+//
 __m512d test_mm512_permutexvar_pd(__m512i __X, __m512d __Y) {
-  // CHECK-LABEL: @test_mm512_permutexvar_pd
-  // CHECK: @llvm.x86.avx512.permvar.df.512
-  return _mm512_permutexvar_pd(__X, __Y); 
+  return _mm512_permutexvar_pd(__X, __Y);
 }
 
+//
 __m512d test_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) {
-  // CHECK-LABEL: @test_mm512_mask_permutexvar_pd
-  // CHECK: @llvm.x86.avx512.permvar.df.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y); 
+  return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y);
 }
 
+//
 __m512d test_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_permutexvar_pd
-  // CHECK: @llvm.x86.avx512.permvar.df.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_permutexvar_pd(__U, __X, __Y); 
+  return _mm512_maskz_permutexvar_pd(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi64
-  // CHECK: @llvm.x86.avx512.permvar.di.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_maskz_permutexvar_epi64(__M, __X, __Y); 
+  return _mm512_maskz_permutexvar_epi64(__M, __X, __Y);
 }
 
+//
 __m512i test_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_permutexvar_epi64
-  // CHECK: @llvm.x86.avx512.permvar.di.512
-  return _mm512_permutexvar_epi64(__X, __Y); 
+  return _mm512_permutexvar_epi64(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi64
-  // CHECK: @llvm.x86.avx512.permvar.di.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
-  return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y); 
+  return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y);
 }
 
+//
 __m512 test_mm512_permutexvar_ps(__m512i __X, __m512 __Y) {
-  // CHECK-LABEL: @test_mm512_permutexvar_ps
-  // CHECK: @llvm.x86.avx512.permvar.sf.512
-  return _mm512_permutexvar_ps(__X, __Y); 
+  return _mm512_permutexvar_ps(__X, __Y);
 }
 
+//
 __m512 test_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) {
-  // CHECK-LABEL: @test_mm512_mask_permutexvar_ps
-  // CHECK: @llvm.x86.avx512.permvar.sf.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y); 
+  return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y);
 }
 
+//
 __m512 test_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_permutexvar_ps
-  // CHECK: @llvm.x86.avx512.permvar.sf.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_permutexvar_ps(__U, __X, __Y); 
+  return _mm512_maskz_permutexvar_ps(__U, __X, __Y);
 }
 
+//
 __m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi32
-  // CHECK: @llvm.x86.avx512.permvar.si.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_permutexvar_epi32(__M, __X, __Y); 
+  return _mm512_maskz_permutexvar_epi32(__M, __X, __Y);
 }
 
+//
 __m512i test_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_permutexvar_epi32
-  // CHECK: @llvm.x86.avx512.permvar.si.512
-  return _mm512_permutexvar_epi32(__X, __Y); 
+  return _mm512_permutexvar_epi32(__X, __Y);
 }
 
+//
 __m512i test_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) {
-  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi32
-  // CHECK: @llvm.x86.avx512.permvar.si.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y); 
+  return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y);
 }
 
+//
 __mmask16 test_mm512_kand(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kand
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = and <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kand(_mm512_cmpneq_epu32_mask(__A, __B),
                                                    _mm512_cmpneq_epu32_mask(__C, __D)),
                                                    __E, __F);
 }
 
+//
 __mmask16 test_mm512_kandn(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kandn
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[LHS]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: [[RES:%.*]] = and <16 x i1> [[NOT]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kandn(_mm512_cmpneq_epu32_mask(__A, __B),
                                                     _mm512_cmpneq_epu32_mask(__C, __D)),
                                                     __E, __F);
 }
 
+//
 __mmask16 test_mm512_kor(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kor
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kor(_mm512_cmpneq_epu32_mask(__A, __B),
                                                   _mm512_cmpneq_epu32_mask(__C, __D)),
                                                   __E, __F);
 }
 
+//
 int test_mm512_kortestc(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_mm512_kortestc
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
-  // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], -1
-  // CHECK: zext i1 [[CMP]] to i32
   return _mm512_kortestc(_mm512_cmpneq_epu32_mask(__A, __B),
                          _mm512_cmpneq_epu32_mask(__C, __D));
 }
 
+//
 int test_mm512_kortestz(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_mm512_kortestz
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
-  // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], 0
-  // CHECK: zext i1 [[CMP]] to i32
   return _mm512_kortestz(_mm512_cmpneq_epu32_mask(__A, __B),
                          _mm512_cmpneq_epu32_mask(__C, __D));
 }
 
+//
 unsigned char test_kortestz_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestz_mask16_u8
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
-  // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], 0
-  // CHECK: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-  // CHECK: trunc i32 [[ZEXT]] to i8
   return _kortestz_mask16_u8(_mm512_cmpneq_epu32_mask(__A, __B),
                              _mm512_cmpneq_epu32_mask(__C, __D));
 }
 
+//
 unsigned char test_kortestc_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D) {
-  // CHECK-LABEL: @test_kortestc_mask16_u8
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
-  // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], -1
-  // CHECK: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-  // CHECK: trunc i32 [[ZEXT]] to i8
   return _kortestc_mask16_u8(_mm512_cmpneq_epu32_mask(__A, __B),
                              _mm512_cmpneq_epu32_mask(__C, __D));
 }
 
+//
 unsigned char test_kortest_mask16_u8(__m512i __A, __m512i __B, __m512i __C, __m512i __D, unsigned char *CF) {
-  // CHECK-LABEL: @test_kortest_mask16_u8
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: [[CAST:%.*]] = bitcast <16 x i1> [[OR]] to i16
-  // CHECK: [[CMP:%.*]] = icmp eq i16 [[CAST]], -1
-  // CHECK: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
-  // CHECK: trunc i32 [[ZEXT]] to i8
-  // CHECK: [[LHS2:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS2:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[OR2:%.*]] = or <16 x i1> [[LHS2]], [[RHS2]]
-  // CHECK: [[CAST2:%.*]] = bitcast <16 x i1> [[OR2]] to i16
-  // CHECK: [[CMP2:%.*]] = icmp eq i16 [[CAST2]], 0
-  // CHECK: [[ZEXT2:%.*]] = zext i1 [[CMP2]] to i32
-  // CHECK: trunc i32 [[ZEXT2]] to i8
   return _kortest_mask16_u8(_mm512_cmpneq_epu32_mask(__A, __B),
                             _mm512_cmpneq_epu32_mask(__C, __D), CF);
 }
 
+//
 __mmask16 test_mm512_kunpackb(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kunpackb
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[LHS2:%.*]] = shufflevector <16 x i1> [[LHS]], <16 x i1> [[LHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: [[RHS2:%.*]] = shufflevector <16 x i1> [[RHS]], <16 x i1> [[RHS]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: [[CONCAT:%.*]] = shufflevector <8 x i1> [[RHS2]], <8 x i1> [[LHS2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: bitcast <16 x i1> [[CONCAT]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kunpackb(_mm512_cmpneq_epu32_mask(__A, __B),
                                                        _mm512_cmpneq_epu32_mask(__C, __D)),
                                                        __E, __F);
 }
 
+//
 __mmask16 test_mm512_kxnor(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kxnor
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[LHS]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: [[RES:%.*]] = xor <16 x i1> [[NOT]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kxnor(_mm512_cmpneq_epu32_mask(__A, __B),
                                                     _mm512_cmpneq_epu32_mask(__C, __D)),
                                                     __E, __F);
 }
 
+//
 __mmask16 test_mm512_kxor(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_mm512_kxor
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = xor <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_mm512_kxor(_mm512_cmpneq_epu32_mask(__A, __B),
                                                    _mm512_cmpneq_epu32_mask(__C, __D)),
                                                    __E, __F);
 }
 
+//
 __mmask16 test_knot_mask16(__mmask16 a) {
-  // CHECK-LABEL: @test_knot_mask16
-  // CHECK: [[IN:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[IN]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: bitcast <16 x i1> [[NOT]] to i16
   return _knot_mask16(a);
 }
 
+//
 __mmask16 test_kand_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kand_mask16
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = and <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kand_mask16(_mm512_cmpneq_epu32_mask(__A, __B),
                                                     _mm512_cmpneq_epu32_mask(__C, __D)),
                                                     __E, __F);
 }
 
+//
 __mmask16 test_kandn_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kandn_mask16
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[LHS]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: [[RES:%.*]] = and <16 x i1> [[NOT]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kandn_mask16(_mm512_cmpneq_epu32_mask(__A, __B),
                                                      _mm512_cmpneq_epu32_mask(__C, __D)),
                                                      __E, __F);
 }
 
+//
 __mmask16 test_kor_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kor_mask16
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = or <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kor_mask16(_mm512_cmpneq_epu32_mask(__A, __B),
                                                    _mm512_cmpneq_epu32_mask(__C, __D)),
                                                    __E, __F);
 }
 
+//
 __mmask16 test_kxnor_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxnor_mask16
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[NOT:%.*]] = xor <16 x i1> [[LHS]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-  // CHECK: [[RES:%.*]] = xor <16 x i1> [[NOT]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kxnor_mask16(_mm512_cmpneq_epu32_mask(__A, __B),
                                                      _mm512_cmpneq_epu32_mask(__C, __D)),
                                                      __E, __F);
 }
 
+//
 __mmask16 test_kxor_mask16(__m512i __A, __m512i __B, __m512i __C, __m512i __D, __m512i __E, __m512i __F) {
-  // CHECK-LABEL: @test_kxor_mask16
-  // CHECK: [[LHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RHS:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = xor <16 x i1> [[LHS]], [[RHS]]
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kxor_mask16(_mm512_cmpneq_epu32_mask(__A, __B),
                                                     _mm512_cmpneq_epu32_mask(__C, __D)),
                                                     __E, __F);
 }
 
+//
 __mmask16 test_kshiftli_mask16(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftli_mask16
-  // CHECK: [[VAL:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = shufflevector <16 x i1> zeroinitializer, <16 x i1> [[VAL]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kshiftli_mask16(_mm512_cmpneq_epu32_mask(A, B), 1), C, D);
 }
 
+//
 __mmask16 test_kshiftri_mask16(__m512i A, __m512i B, __m512i C, __m512i D) {
-  // CHECK-LABEL: @test_kshiftri_mask16
-  // CHECK: [[VAL:%.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: [[RES:%.*]] = shufflevector <16 x i1> [[VAL]], <16 x i1> zeroinitializer, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
-  // CHECK: bitcast <16 x i1> [[RES]] to i16
   return _mm512_mask_cmpneq_epu32_mask(_kshiftri_mask16(_mm512_cmpneq_epu32_mask(A, B), 1), C, D);
 }
 
+//
 unsigned int test_cvtmask16_u32(__m512i A, __m512i B) {
-  // CHECK-LABEL: @test_cvtmask16_u32
-  // CHECK: bitcast <16 x i1> %{{.*}} to i16
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: zext i16 %{{.*}} to i32
   return _cvtmask16_u32(_mm512_cmpneq_epu32_mask(A, B));
 }
 
+//
 __mmask16 test_cvtu32_mask16(__m512i A, __m512i B, unsigned int C) {
-  // CHECK-LABEL: @test_cvtu32_mask16
-  // CHECK: trunc i32 %{{.*}} to i16
-  // CHECK: bitcast i16 %{{.*}} to <16 x i1>
   return _mm512_mask_cmpneq_epu32_mask(_cvtu32_mask16(C), A, B);
 }
 
+//
 __mmask16 test_load_mask16(__mmask16 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_load_mask16
-  // CHECK: [[LOAD:%.*]] = load i16, ptr %{{.*}}{{$}}
-  // CHECK: bitcast i16 [[LOAD]] to <16 x i1>
   return _mm512_mask_cmpneq_epu32_mask(_load_mask16(A), B, C);
 }
 
+//
 void test_store_mask16(__mmask16 *A, __m512i B, __m512i C) {
-  // CHECK-LABEL: @test_store_mask16
-  // CHECK: bitcast <16 x i1> %{{.*}} to i16
-  // CHECK: store i16 %{{.*}}, ptr %{{.*}}
   _store_mask16(A, _mm512_cmpneq_epu32_mask(B, C));
 }
 
+//
 void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_stream_si512
-  // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL:![0-9]+]]
-  _mm512_stream_si512(__P, __A); 
+  _mm512_stream_si512(__P, __A);
 }
 
+//
 void test_mm512_stream_si512_2(void * __P, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_stream_si512
-  // CHECK: store <8 x i64> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]
-  _mm512_stream_si512(__P, __A); 
+  _mm512_stream_si512(__P, __A);
 }
 
+//
 __m512i test_mm512_stream_load_si512(void *__P) {
-  // CHECK-LABEL: @test_mm512_stream_load_si512
-  // CHECK: load <8 x i64>, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]{{$}}
-  return _mm512_stream_load_si512(__P); 
+  return _mm512_stream_load_si512(__P);
 }
 
+//
 __m512i test_mm512_stream_load_si512_const(void const *__P) {
-  // CHECK-LABEL: @test_mm512_stream_load_si512_const
-  // CHECK: load <8 x i64>, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]{{$}}
-  return _mm512_stream_load_si512(__P); 
+  return _mm512_stream_load_si512(__P);
 }
 
+//
 void test_mm512_stream_pd(double *__P, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_stream_pd
-  // CHECK: store <8 x double> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]
-  return _mm512_stream_pd(__P, __A); 
+  return _mm512_stream_pd(__P, __A);
 }
 
+//
 void test_mm512_stream_pd_2(void *__P, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_stream_pd
-  // CHECK: store <8 x double> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]
-  return _mm512_stream_pd(__P, __A); 
+  return _mm512_stream_pd(__P, __A);
 }
 
+//
 void test_mm512_stream_ps(float *__P, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_stream_ps
-  // CHECK: store <16 x float> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]
-  _mm512_stream_ps(__P, __A); 
+  _mm512_stream_ps(__P, __A);
 }
 
+//
 void test_mm512_stream_ps_2(void *__P, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_stream_ps
-  // CHECK: store <16 x float> %{{.*}}, ptr %{{.*}}, align 64, !nontemporal [[NONTEMPORAL]]
-  _mm512_stream_ps(__P, __A); 
+  _mm512_stream_ps(__P, __A);
 }
+//
 __m512d test_mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_compress_pd
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_mask_compress_pd(__W, __U, __A); 
+  return _mm512_mask_compress_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_compress_pd
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_maskz_compress_pd(__U, __A); 
+  return _mm512_maskz_compress_pd(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_compress_epi64
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_mask_compress_epi64(__W, __U, __A); 
+  return _mm512_mask_compress_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_compress_epi64
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_maskz_compress_epi64(__U, __A); 
+  return _mm512_maskz_compress_epi64(__U, __A);
 }
 
+//
 __m512 test_mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_compress_ps
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_mask_compress_ps(__W, __U, __A); 
+  return _mm512_mask_compress_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_compress_ps
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_maskz_compress_ps(__U, __A); 
+  return _mm512_maskz_compress_ps(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_compress_epi32
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_mask_compress_epi32(__W, __U, __A); 
+  return _mm512_mask_compress_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_compress_epi32
-  // CHECK: @llvm.x86.avx512.mask.compress
-  return _mm512_maskz_compress_epi32(__U, __A); 
+  return _mm512_maskz_compress_epi32(__U, __A);
 }
 
+//
 __mmask8 test_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y) {
-  // CHECK-LABEL: @test_mm_cmp_round_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_round_ss_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_round_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_round_ss_mask(__M, __X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm_cmp_ss_mask(__m128 __X, __m128 __Y) {
-  // CHECK-LABEL: @test_mm_cmp_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_ss_mask(__X, __Y, _CMP_NLT_US);
 }
 
+//
 __mmask8 test_mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_ss_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_ss_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
+//
 __mmask8 test_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y) {
-  // CHECK-LABEL: @test_mm_cmp_round_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_round_sd_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_round_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_round_sd_mask(__M, __X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+//
 __mmask8 test_mm_cmp_sd_mask(__m128d __X, __m128d __Y) {
-  // CHECK-LABEL: @test_mm_cmp_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_cmp_sd_mask(__X, __Y, _CMP_NLT_US);
 }
 
+//
 __mmask8 test_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_sd_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp
   return _mm_mask_cmp_sd_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
+//
 __m512 test_mm512_movehdup_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_movehdup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
   return _mm512_movehdup_ps(__A);
 }
 
+//
 __m512 test_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_movehdup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_movehdup_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_movehdup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_movehdup_ps(__U, __A);
 }
 
+//
 __m512 test_mm512_moveldup_ps(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_moveldup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
   return _mm512_moveldup_ps(__A);
 }
 
+//
 __m512 test_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_moveldup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_moveldup_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_moveldup_ps
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_moveldup_ps(__U, __A);
 }
 
+//
 __m512i test_mm512_shuffle_epi32(__m512i __A) {
-  // CHECK-LABEL: @test_mm512_shuffle_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
-  return _mm512_shuffle_epi32(__A, 1); 
+  return _mm512_shuffle_epi32(__A, 1);
 }
 
+//
 __m512i test_mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_shuffle_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_mask_shuffle_epi32(__W, __U, __A, 1); 
+  return _mm512_mask_shuffle_epi32(__W, __U, __A, 1);
 }
 
+//
 __m512i test_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_shuffle_epi32
-  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_shuffle_epi32(__U, __A, 1); 
+  return _mm512_maskz_shuffle_epi32(__U, __A, 1);
 }
 
+//
 __m512d test_mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_expand_pd
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_mask_expand_pd(__W, __U, __A); 
+  return _mm512_mask_expand_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_expand_pd
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_maskz_expand_pd(__U, __A); 
+  return _mm512_maskz_expand_pd(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_expand_epi64
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_mask_expand_epi64(__W, __U, __A); 
+  return _mm512_mask_expand_epi64(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_expand_epi64
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_maskz_expand_epi64(__U, __A); 
+  return _mm512_maskz_expand_epi64(__U, __A);
 }
+//
 __m512i test_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v8i64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
-  return _mm512_mask_expandloadu_epi64(__W, __U, __P); 
+  return _mm512_mask_expandloadu_epi64(__W, __U, __P);
 }
 
+//
 __m512i test_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi64
-  // CHECK: @llvm.masked.expandload.v8i64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
-  return _mm512_maskz_expandloadu_epi64(__U, __P); 
+  return _mm512_maskz_expandloadu_epi64(__U, __P);
 }
 
+//
 __m512d test_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v8f64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
-  return _mm512_mask_expandloadu_pd(__W, __U, __P); 
+  return _mm512_mask_expandloadu_pd(__W, __U, __P);
 }
 
+//
 __m512d test_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_expandloadu_pd
-  // CHECK: @llvm.masked.expandload.v8f64(ptr %{{.*}}, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
-  return _mm512_maskz_expandloadu_pd(__U, __P); 
+  return _mm512_maskz_expandloadu_pd(__U, __P);
 }
 
+//
 __m512i test_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v16i32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
-  return _mm512_mask_expandloadu_epi32(__W, __U, __P); 
+  return _mm512_mask_expandloadu_epi32(__W, __U, __P);
 }
 
+//
 __m512i test_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi32
-  // CHECK: @llvm.masked.expandload.v16i32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
-  return _mm512_maskz_expandloadu_epi32(__U, __P); 
+  return _mm512_maskz_expandloadu_epi32(__U, __P);
 }
 
+//
 __m512 test_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_mask_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v16f32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
-  return _mm512_mask_expandloadu_ps(__W, __U, __P); 
+  return _mm512_mask_expandloadu_ps(__W, __U, __P);
 }
 
+//
 __m512 test_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) {
-  // CHECK-LABEL: @test_mm512_maskz_expandloadu_ps
-  // CHECK: @llvm.masked.expandload.v16f32(ptr %{{.*}}, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
-  return _mm512_maskz_expandloadu_ps(__U, __P); 
+  return _mm512_maskz_expandloadu_ps(__U, __P);
 }
 
+//
 __m512 test_mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_expand_ps
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_mask_expand_ps(__W, __U, __A); 
+  return _mm512_mask_expand_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_expand_ps
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_maskz_expand_ps(__U, __A); 
+  return _mm512_maskz_expand_ps(__U, __A);
 }
 
+//
 __m512i test_mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_expand_epi32
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_mask_expand_epi32(__W, __U, __A); 
+  return _mm512_mask_expand_epi32(__W, __U, __A);
 }
 
+//
 __m512i test_mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_maskz_expand_epi32
-  // CHECK: @llvm.x86.avx512.mask.expand
-  return _mm512_maskz_expand_epi32(__U, __A); 
+  return _mm512_maskz_expand_epi32(__U, __A);
 }
+//
 __m512d test_mm512_cvt_roundps_pd(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvt_roundps_pd
-  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
   return _mm512_cvt_roundps_pd(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_pd
-  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
   return _mm512_mask_cvt_roundps_pd(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_pd
-  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
   return _mm512_maskz_cvt_roundps_pd(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_cvtps_pd(__m256 __A) {
-  // CHECK-LABEL: @test_mm512_cvtps_pd
-  // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
-  return _mm512_cvtps_pd(__A); 
+  return _mm512_cvtps_pd(__A);
 }
 
+//
 __m512d test_mm512_cvtpslo_pd(__m512 __A) {
-  // CHECK-LABEL: @test_mm512_cvtpslo_pd
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
   return _mm512_cvtpslo_pd(__A);
 }
 
+//
 __m512d test_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_pd
-  // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_cvtps_pd(__W, __U, __A); 
+  return _mm512_mask_cvtps_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_mask_cvtpslo_pd(__m512d __W, __mmask8 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtpslo_pd
-  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_cvtpslo_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_pd
-  // CHECK: fpext <8 x float> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_cvtps_pd(__U, __A); 
+  return _mm512_maskz_cvtps_pd(__U, __A);
 }
+//
 __m512d test_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_pd
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_mask_mov_pd(__W, __U, __A); 
+  return _mm512_mask_mov_pd(__W, __U, __A);
 }
 
+//
 __m512d test_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_pd
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
-  return _mm512_maskz_mov_pd(__U, __A); 
+  return _mm512_maskz_mov_pd(__U, __A);
 }
 
+//
 __m512 test_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_mov_ps
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_mask_mov_ps(__W, __U, __A); 
+  return _mm512_mask_mov_ps(__W, __U, __A);
 }
 
+//
 __m512 test_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_maskz_mov_ps
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  return _mm512_maskz_mov_ps(__U, __A); 
+  return _mm512_maskz_mov_ps(__U, __A);
 }
 
+//
 void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_compressstoreu_pd
-  // CHECK: @llvm.masked.compressstore.v8f64(<8 x double> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
-  return _mm512_mask_compressstoreu_pd(__P, __U, __A); 
+  return _mm512_mask_compressstoreu_pd(__P, __U, __A);
 }
 
+//
 void test_mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi64
-  // CHECK: @llvm.masked.compressstore.v8i64(<8 x i64> %{{.*}}, ptr %{{.*}}, <8 x i1> %{{.*}})
-  return _mm512_mask_compressstoreu_epi64(__P, __U, __A); 
+  return _mm512_mask_compressstoreu_epi64(__P, __U, __A);
 }
 
+//
 void test_mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
-  // CHECK-LABEL: @test_mm512_mask_compressstoreu_ps
-  // CHECK: @llvm.masked.compressstore.v16f32(<16 x float> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
-  return _mm512_mask_compressstoreu_ps(__P, __U, __A); 
+  return _mm512_mask_compressstoreu_ps(__P, __U, __A);
 }
 
+//
 void test_mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
-  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi32
-  // CHECK: @llvm.masked.compressstore.v16i32(<16 x i32> %{{.*}}, ptr %{{.*}}, <16 x i1> %{{.*}})
-  return _mm512_mask_compressstoreu_epi32(__P, __U, __A); 
+  return _mm512_mask_compressstoreu_epi32(__P, __U, __A);
 }
 
+//
 __m256i test_mm512_cvtt_roundpd_epu32(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
   return _mm512_cvtt_roundpd_epu32(__A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
   return _mm512_mask_cvtt_roundpd_epu32(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
   return _mm512_maskz_cvtt_roundpd_epu32(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+//
 __m256i test_mm512_cvttpd_epu32(__m512d __A) {
-  // CHECK-LABEL: @test_mm512_cvttpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
-  return _mm512_cvttpd_epu32(__A); 
+  return _mm512_cvttpd_epu32(__A);
 }
 
+//
 __m256i test_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvttpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
-  return _mm512_mask_cvttpd_epu32(__W, __U, __A); 
+  return _mm512_mask_cvttpd_epu32(__W, __U, __A);
 }
 
+//
 __m256i test_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
-  return _mm512_maskz_cvttpd_epu32(__U, __A); 
+  return _mm512_maskz_cvttpd_epu32(__U, __A);
 }
 
+//
 __m512 test_mm512_castpd_ps (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_castpd_ps 
-  // CHECK: bitcast <8 x double> %{{.}} to <16 x float>
   return _mm512_castpd_ps (__A);
 }
 
+//
 __m512d test_mm512_castps_pd (__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_castps_pd 
-  // CHECK: bitcast <16 x float> %{{.}} to <8 x double>
   return _mm512_castps_pd (__A);
 }
 
+//
 __m512i test_mm512_castpd_si512 (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_castpd_si512 
-  // CHECK: bitcast <8 x double> %{{.}} to <8 x i64>
   return _mm512_castpd_si512 (__A);
 }
 
+//
 __m512 test_mm512_castps128_ps512(__m128 __A) {
-  // CHECK-LABEL: @test_mm512_castps128_ps512
-  // CHECK: [[B:%.*]] = freeze <8 x float> poison
-  // CHECK: store <8 x float> [[B]], ptr [[BA:%.*]]
-  // CHECK: [[A:%.*]] = freeze <4 x float> poison 
-  // CHECK: [[SV:%.*]] = shufflevector <4 x float> %{{.*}}, <4 x float> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  // CHECK: [[C:%.*]] = load <8 x float>, ptr [[BA]]
-  // CHECK: shufflevector <8 x float> [[SV]], <8 x float> [[C]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  return _mm512_castps128_ps512(__A); 
+  return _mm512_castps128_ps512(__A);
 }
 
+//
 __m512d test_mm512_castpd128_pd512(__m128d __A) {
-  // CHECK-LABEL: @test_mm512_castpd128_pd512
-  // CHECK: [[B:%.*]] = freeze <4 x double> poison
-  // CHECK: store <4 x double> [[B]], ptr [[BA:%.*]]
-  // CHECK: [[A:%.*]] = freeze <2 x double> poison 
-  // CHECK: [[SV:%.*]] = shufflevector <2 x double> %{{.*}}, <2 x double> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: [[C:%.*]] = load <4 x double>, ptr [[BA]]
-  // CHECK: shufflevector <4 x double> [[SV]], <4 x double> [[C]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  return _mm512_castpd128_pd512(__A); 
+  return _mm512_castpd128_pd512(__A);
 }
 
+//
 __m512i test_mm512_set1_epi8(char d)
 {
-  // CHECK-LABEL: @test_mm512_set1_epi8
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 0
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 1
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 2
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 3
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 4
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 5
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 6
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 7
-  // CHECK: insertelement <64 x i8> {{.*}}, i32 63
   return _mm512_set1_epi8(d);
 }
 
+//
 __m512i test_mm512_set1_epi16(short d)
 {
-  // CHECK-LABEL: @test_mm512_set1_epi16
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 0
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 1
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 2
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 3
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 4
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 5
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 6
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 7
-  // CHECK: insertelement <32 x i16> {{.*}}, i32 31
   return _mm512_set1_epi16(d);
 }
 
+//
 __m512i test_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
 {
-  // CHECK-LABEL: @test_mm512_set4_epi32 
-  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
   return _mm512_set4_epi32 (__A,__B,__C,__D);
 }
 
+//
 __m512i test_mm512_set4_epi64 (long long __A, long long __B, long long __C, long long __D)
 {
-  // CHECK-LABEL: @test_mm512_set4_epi64 
-  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
   return _mm512_set4_epi64 (__A,__B,__C,__D);
 }
 
+//
 __m512d test_mm512_set4_pd (double __A, double __B, double __C, double __D)
 {
-  // CHECK-LABEL: @test_mm512_set4_pd 
-  // CHECK: insertelement <8 x double> {{.*}}, i32 7
   return _mm512_set4_pd (__A,__B,__C,__D);
 }
 
+//
 __m512 test_mm512_set4_ps (float __A, float __B, float __C, float __D)
 {
-  // CHECK-LABEL: @test_mm512_set4_ps 
-  // CHECK: insertelement <16 x float> {{.*}}, i32 15
   return _mm512_set4_ps (__A,__B,__C,__D);
 }
 
+//
 __m512i test_mm512_setr4_epi32(int e0, int e1, int e2, int e3)
 {
-  // CHECK-LABEL: @test_mm512_setr4_epi32
-  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
   return _mm512_setr4_epi32(e0, e1, e2, e3);
 }
 
+//
  __m512i test_mm512_setr4_epi64(long long e0, long long e1, long long e2, long long e3)
 {
-  // CHECK-LABEL: @test_mm512_setr4_epi64
-  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
   return _mm512_setr4_epi64(e0, e1, e2, e3);
 }
 
+//
 __m512d test_mm512_setr4_pd(double e0, double e1, double e2, double e3)
 {
-  // CHECK-LABEL: @test_mm512_setr4_pd
-  // CHECK: insertelement <8 x double> {{.*}}, i32 7
   return _mm512_setr4_pd(e0,e1,e2,e3);
 }
 
+//
  __m512 test_mm512_setr4_ps(float e0, float e1, float e2, float e3)
 {
-  // CHECK-LABEL: @test_mm512_setr4_ps
-  // CHECK: insertelement <16 x float> {{.*}}, i32 15
   return _mm512_setr4_ps(e0,e1,e2,e3);
 }
 
+//
 __m512d test_mm512_castpd256_pd512(__m256d a)
 {
-  // CHECK-LABEL: @test_mm512_castpd256_pd512
-  // CHECK: [[A:%.*]] = freeze <4 x double> poison 
-  // CHECK: shufflevector <4 x double> %{{.}}, <4 x double> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_castpd256_pd512(a);
 }
 
+//
 __m256d test_mm512_castpd512_pd256 (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_castpd512_pd256 
-  // CHECK: shufflevector <8 x double> %{{.}}, <8 x double> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm512_castpd512_pd256 (__A);
 }
 
+//
 __m256 test_mm512_castps512_ps256 (__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_castps512_ps256 
-  // CHECK: shufflevector <16 x float> %{{.}}, <16 x float> %{{.}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_castps512_ps256 (__A);
 }
 
+//
 __m512i test_mm512_castps_si512 (__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_castps_si512 
-  // CHECK: bitcast <16 x float> %{{.}} to <8 x i64>
   return _mm512_castps_si512 (__A);
 }
+//
 __m512i test_mm512_castsi128_si512(__m128i __A) {
-  // CHECK-LABEL: @test_mm512_castsi128_si512
-  // CHECK: [[B:%.*]] = freeze <4 x i64> poison
-  // CHECK: store <4 x i64> [[B]], ptr [[BA:%.*]]
-  // CHECK: [[A:%.*]] = freeze <2 x i64> poison 
-  // CHECK: [[SV:%.*]] = shufflevector <2 x i64> %{{.*}}, <2 x i64> [[A]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: [[C:%.*]] = load <4 x i64>, ptr [[BA]]
-  // CHECK: shufflevector <4 x i64> [[SV]], <4 x i64> [[C]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  return _mm512_castsi128_si512(__A); 
+  return _mm512_castsi128_si512(__A);
 }
 
+//
 __m512i test_mm512_castsi256_si512(__m256i __A) {
-  // CHECK-LABEL: @test_mm512_castsi256_si512
-  // CHECK: [[A:%.*]] = freeze <4 x i64> poison 
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> [[A]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  return _mm512_castsi256_si512(__A); 
+  return _mm512_castsi256_si512(__A);
 }
 
+//
 __m512 test_mm512_castsi512_ps (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_castsi512_ps 
-  // CHECK: bitcast <8 x i64> %{{.}} to <16 x float>
   return _mm512_castsi512_ps (__A);
 }
 
+//
 __m512d test_mm512_castsi512_pd (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_castsi512_pd 
-  // CHECK: bitcast <8 x i64> %{{.}} to <8 x double>
   return _mm512_castsi512_pd (__A);
 }
 
+//
 __m128i test_mm512_castsi512_si128 (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_castsi512_si128 
-  // CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <2 x i32> <i32 0, i32 1>
   return _mm512_castsi512_si128 (__A);
 }
 
+//
 __m256i test_mm512_castsi512_si256 (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_castsi512_si256 
-  // CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return _mm512_castsi512_si256 (__A);
 }
 
+//
 __m128 test_mm_cvt_roundsd_ss(__m128 __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundsd_ss
-  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
   return _mm_cvt_roundsd_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_mask_cvt_roundsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_cvt_roundsd_ss
-  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
   return _mm_mask_cvt_roundsd_ss(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_maskz_cvt_roundsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_cvt_roundsd_ss
-  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
   return _mm_maskz_cvt_roundsd_ss(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
 #ifdef __x86_64__
+//
 __m128d test_mm_cvt_roundi64_sd(__m128d __A, long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundi64_sd
-  // CHECK: @llvm.x86.avx512.cvtsi2sd64
   return _mm_cvt_roundi64_sd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_cvt_roundsi64_sd(__m128d __A, long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundsi64_sd
-  // CHECK: @llvm.x86.avx512.cvtsi2sd64
   return _mm_cvt_roundsi64_sd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 #endif
 
+//
 __m128 test_mm_cvt_roundsi32_ss(__m128 __A, int __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundsi32_ss
-  // CHECK: @llvm.x86.avx512.cvtsi2ss32
   return _mm_cvt_roundsi32_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_cvt_roundi32_ss(__m128 __A, int __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundi32_ss
-  // CHECK: @llvm.x86.avx512.cvtsi2ss32
   return _mm_cvt_roundi32_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
 #ifdef __x86_64__
+//
 __m128 test_mm_cvt_roundsi64_ss(__m128 __A, long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundsi64_ss
-  // CHECK: @llvm.x86.avx512.cvtsi2ss64
   return _mm_cvt_roundsi64_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_cvt_roundi64_ss(__m128 __A, long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundi64_ss
-  // CHECK: @llvm.x86.avx512.cvtsi2ss64
   return _mm_cvt_roundi64_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 #endif
 
+//
 __m128d test_mm_cvt_roundss_sd(__m128d __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundss_sd
-  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
   return _mm_cvt_roundss_sd(__A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_mask_cvt_roundss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_cvt_roundss_sd
-  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
   return _mm_mask_cvt_roundss_sd(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_maskz_cvt_roundss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_cvt_roundss_sd
-  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
   return _mm_maskz_cvt_roundss_sd( __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_cvtu32_sd(__m128d __A, unsigned __B) {
-  // CHECK-LABEL: @test_mm_cvtu32_sd
-  // CHECK: uitofp i32 %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
-  return _mm_cvtu32_sd(__A, __B); 
+  return _mm_cvtu32_sd(__A, __B);
 }
 
 #ifdef __x86_64__
+//
 __m128d test_mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundu64_sd
-  // CHECK: @llvm.x86.avx512.cvtusi642sd
   return _mm_cvt_roundu64_sd(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128d test_mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
-  // CHECK-LABEL: @test_mm_cvtu64_sd
-  // CHECK: uitofp i64 %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
-  return _mm_cvtu64_sd(__A, __B); 
+  return _mm_cvtu64_sd(__A, __B);
 }
 #endif
 
+//
 __m128 test_mm_cvt_roundu32_ss(__m128 __A, unsigned __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundu32_ss
-  // CHECK: @llvm.x86.avx512.cvtusi2ss
   return _mm_cvt_roundu32_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_cvtu32_ss(__m128 __A, unsigned __B) {
-  // CHECK-LABEL: @test_mm_cvtu32_ss
-  // CHECK: uitofp i32 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
-  return _mm_cvtu32_ss(__A, __B); 
+  return _mm_cvtu32_ss(__A, __B);
 }
 
 #ifdef __x86_64__
+//
 __m128 test_mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B) {
-  // CHECK-LABEL: @test_mm_cvt_roundu64_ss
-  // CHECK: @llvm.x86.avx512.cvtusi642ss
     return _mm_cvt_roundu64_ss(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+//
 __m128 test_mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
-  // CHECK-LABEL: @test_mm_cvtu64_ss
-  // CHECK: uitofp i64 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
-  return _mm_cvtu64_ss(__A, __B); 
+  return _mm_cvtu64_ss(__A, __B);
 }
 #endif
 
+//
 __m512i test_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvttps_epu32 
-  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
   return _mm512_mask_cvttps_epu32 (__W,__U,__A);
 }
 
+//
 __m512i test_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvttps_epu32 
-  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
   return _mm512_maskz_cvttps_epu32 (__U,__A);
 }
 
+//
 __m512 test_mm512_cvtepu32_ps (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepu32_ps 
-  // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
   return _mm512_cvtepu32_ps (__A);
 }
 
+//
 __m512 test_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu32_ps 
-  // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> {{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
   return _mm512_mask_cvtepu32_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_ps 
-  // CHECK: uitofp <16 x i32> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> {{.*}}, <16 x float> {{.*}}, <16 x float> {{.*}}
   return _mm512_maskz_cvtepu32_ps (__U,__A);
 }
 
+//
 __m512d test_mm512_cvtepi32_pd (__m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepi32_pd
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double>
   return _mm512_cvtepi32_pd (__A);
 }
 
+//
 __m512d test_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_pd
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_mask_cvtepi32_pd (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_pd
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_cvtepi32_pd (__U,__A);
 }
 
+//
 __m512d test_mm512_cvtepi32lo_pd (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepi32lo_pd
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double>
   return _mm512_cvtepi32lo_pd (__A);
 }
 
+//
 __m512d test_mm512_mask_cvtepi32lo_pd (__m512d __W, __mmask8 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32lo_pd
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: sitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_mask_cvtepi32lo_pd (__W, __U, __A);
 }
 
+//
 __m512 test_mm512_cvtepi32_ps (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepi32_ps 
-  // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
   return _mm512_cvtepi32_ps (__A);
 }
 
+//
 __m512 test_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepi32_ps 
-  // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvtepi32_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_ps 
-  // CHECK: sitofp <16 x i32> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvtepi32_ps (__U,__A);
 }
 
+//
 __m512d test_mm512_cvtepu32_pd(__m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepu32_pd
-  // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double>
   return _mm512_cvtepu32_pd(__A);
 }
 
+//
 __m512d test_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu32_pd
-  // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_mask_cvtepu32_pd (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_pd
-  // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_maskz_cvtepu32_pd (__U,__A);
 }
 
+//
 __m512d test_mm512_cvtepu32lo_pd (__m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtepu32lo_pd
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double>
   return _mm512_cvtepu32lo_pd (__A);
 }
 
+//
 __m512d test_mm512_mask_cvtepu32lo_pd (__m512d __W, __mmask8 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtepu32lo_pd
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK: uitofp <8 x i32> %{{.*}} to <8 x double>
-  // CHECK: select <8 x i1> {{.*}}, <8 x double> {{.*}}, <8 x double> {{.*}}
   return _mm512_mask_cvtepu32lo_pd (__W, __U, __A);
 }
 
+//
 __m256 test_mm512_cvtpd_ps (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtpd_ps 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_cvtpd_ps (__A);
 }
 
+//
 __m256 test_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_ps 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_mask_cvtpd_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_cvtpd_pslo(__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtpd_pslo
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
-  // CHECK: zeroinitializer
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_cvtpd_pslo(__A);
 }
 
+//
 __m512 test_mm512_mask_cvtpd_pslo(__m512 __W, __mmask8 __U, __m512d __A) {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_pslo
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
-  // CHECK: zeroinitializer
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_mask_cvtpd_pslo(__W, __U, __A);
 }
 
+//
 __m256 test_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtpd_ps 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
   return _mm512_maskz_cvtpd_ps (__U,__A);
 }
 
+//
 __m512 test_mm512_cvtph_ps (__m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtph_ps 
-  // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16>
-  // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half>
-  // CHECK: fpext <16 x half> %{{.*}} to <16 x float>
   return _mm512_cvtph_ps (__A);
 }
 
+//
 __m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtph_ps 
-  // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16>
-  // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half>
-  // CHECK: fpext <16 x half> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_cvtph_ps (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtph_ps 
-  // CHECK: bitcast <4 x i64> %{{.*}} to <16 x i16>
-  // CHECK: bitcast <16 x i16> %{{.*}} to <16 x half>
-  // CHECK: fpext <16 x half> %{{.*}} to <16 x float>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_cvtph_ps (__U,__A);
 }
 
+//
 __m256i test_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvttpd_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
   return _mm512_mask_cvttpd_epi32 (__W,__U,__A);
 }
 
+//
 __m256i test_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
   return _mm512_maskz_cvttpd_epi32 (__U,__A);
 }
 
+//
 __m512i test_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvttps_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
   return _mm512_mask_cvttps_epi32 (__W,__U,__A);
 }
 
+//
 __m512i test_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvttps_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
   return _mm512_maskz_cvttps_epi32 (__U,__A);
 }
 
+//
 __m512i test_mm512_cvtps_epi32 (__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtps_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_cvtps_epi32 (__A);
 }
 
+//
 __m512i test_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_mask_cvtps_epi32 (__W,__U,__A);
 }
 
+//
 __m512i test_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
   return _mm512_maskz_cvtps_epi32 (__U,__A);
 }
 
+//
 __m256i test_mm512_cvtpd_epi32 (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtpd_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_cvtpd_epi32 (__A);
 }
 
+//
 __m256i test_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_mask_cvtpd_epi32 (__W,__U,__A);
 }
 
+//
 __m256i test_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epi32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
   return _mm512_maskz_cvtpd_epi32 (__U,__A);
 }
 
+//
 __m256i test_mm512_cvtpd_epu32 (__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtpd_epu32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_cvtpd_epu32 (__A);
 }
 
+//
 __m256i test_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtpd_epu32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_mask_cvtpd_epu32 (__W,__U,__A);
 }
 
+//
 __m256i test_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epu32 
-  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
   return _mm512_maskz_cvtpd_epu32 (__U,__A);
 }
 
-__m256i test_mm512_mask_cvtps_ph(__m256i src, __mmask16 k, __m512 a) 
+//
+__m256i test_mm512_mask_cvtps_ph(__m256i src, __mmask16 k, __m512 a)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_ph
-  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
   return _mm512_mask_cvtps_ph(src, k, a,_MM_FROUND_TO_ZERO);
 }
 
-__m256i test_mm512_maskz_cvtps_ph (__mmask16 k, __m512 a) 
+//
+__m256i test_mm512_maskz_cvtps_ph (__mmask16 k, __m512 a)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_ph
-  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
   return _mm512_maskz_cvtps_ph( k, a,_MM_FROUND_TO_ZERO);
 }
 
-__m512i test_mm512_cvtps_epu32 ( __m512 __A) 
+//
+__m512i test_mm512_cvtps_epu32 ( __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_cvtps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_cvtps_epu32(__A);
 }
 
+//
 __m512i test_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_cvtps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_mask_cvtps_epu32( __W, __U, __A);
 }
+//
 __m512i test_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_cvtps_epu32
-  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
   return _mm512_maskz_cvtps_epu32( __U, __A);
 }
 
+//
 double test_mm512_cvtsd_f64(__m512d A) {
-  // CHECK-LABEL: test_mm512_cvtsd_f64
-  // CHECK: extractelement <8 x double> %{{.*}}, i32 0
   return _mm512_cvtsd_f64(A);
 }
 
+//
 float test_mm512_cvtss_f32(__m512 A) {
-  // CHECK-LABEL: test_mm512_cvtss_f32
-  // CHECK: extractelement <16 x float> %{{.*}}, i32 0
   return _mm512_cvtss_f32(A);
 }
 
+//
 __m512d test_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_pd 
-  // CHECK: @llvm.x86.avx512.max.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_max_pd (__W,__U,__A,__B);
 }
 
+//
 __m512d test_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_pd 
-  // CHECK: @llvm.x86.avx512.max.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_max_pd (__U,__A,__B);
 }
 
+//
 __m512 test_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_ps 
-  // CHECK: @llvm.x86.avx512.max.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_max_ps (__W,__U,__A,__B);
 }
 
+//
 __m512d test_mm512_mask_max_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_round_pd
-  // CHECK: @llvm.x86.avx512.max.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_max_round_pd(__W,__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_max_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_round_pd
-  // CHECK: @llvm.x86.avx512.max.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_max_round_pd(__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_max_round_pd(__m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_max_round_pd
-  // CHECK: @llvm.x86.avx512.max.pd.512
   return _mm512_max_round_pd(__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_ps 
-  // CHECK: @llvm.x86.avx512.max.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_max_ps (__U,__A,__B);
 }
 
+//
 __m512 test_mm512_mask_max_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_round_ps
-  // CHECK: @llvm.x86.avx512.max.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_max_round_ps(__W,__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_max_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_round_ps
-  // CHECK: @llvm.x86.avx512.max.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_max_round_ps(__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_max_round_ps(__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_max_round_ps
-  // CHECK: @llvm.x86.avx512.max.ps.512
   return _mm512_max_round_ps(__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_pd 
-  // CHECK: @llvm.x86.avx512.min.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_min_pd (__W,__U,__A,__B);
 }
 
+//
 __m512d test_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_pd 
-  // CHECK: @llvm.x86.avx512.min.pd.512
   return _mm512_maskz_min_pd (__U,__A,__B);
 }
 
+//
 __m512d test_mm512_mask_min_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_round_pd
-  // CHECK: @llvm.x86.avx512.min.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_min_round_pd(__W,__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_min_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_round_pd
-  // CHECK: @llvm.x86.avx512.min.pd.512
-  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_min_round_pd(__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_min_round_pd( __m512d __A,__m512d __B)
 {
-  // CHECK-LABEL: @test_mm512_min_round_pd
-  // CHECK: @llvm.x86.avx512.min.pd.512
   return _mm512_min_round_pd(__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_ps 
-  // CHECK: @llvm.x86.avx512.min.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_min_ps (__W,__U,__A,__B);
 }
 
+//
 __m512 test_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_ps 
-  // CHECK: @llvm.x86.avx512.min.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_min_ps (__U,__A,__B);
 }
 
+//
 __m512 test_mm512_mask_min_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_round_ps
-  // CHECK: @llvm.x86.avx512.min.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_min_round_ps(__W,__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_maskz_min_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_round_ps
-  // CHECK: @llvm.x86.avx512.min.ps.512
-  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_min_round_ps(__U,__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_min_round_ps(__m512 __A,__m512 __B)
 {
-  // CHECK-LABEL: @test_mm512_min_round_ps
-  // CHECK: @llvm.x86.avx512.min.ps.512
   return _mm512_min_round_ps(__A,__B,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_mask_floor_ps (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_pd 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_mask_floor_pd (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_ps 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_mask_ceil_ps (__W,__U,__A);
 }
 
+//
 __m512d test_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_pd 
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_mask_ceil_pd (__W,__U,__A);
 }
 
-__m512 test_mm512_mask_roundscale_ps(__m512 __W, __mmask16 __U, __m512 __A) 
+//
+__m512 test_mm512_mask_roundscale_ps(__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_mask_roundscale_ps(__W,__U,__A, 1);
 }
 
-__m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A) 
+//
+__m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_maskz_roundscale_ps(__U,__A, 1);
 }
 
+//
 __m512 test_mm512_mask_roundscale_round_ps(__m512 __A,__mmask16 __U,__m512 __C)
 {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_round_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_mask_roundscale_round_ps(__A,__U,__C,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
-__m512 test_mm512_maskz_roundscale_round_ps(__m512 __A,__mmask16 __U) 
+//
+__m512 test_mm512_maskz_roundscale_round_ps(__m512 __A,__mmask16 __U)
 {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_maskz_roundscale_round_ps(__U,__A,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512 test_mm512_roundscale_round_ps(__m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_roundscale_round_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
   return _mm512_roundscale_round_ps(__A,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
-__m512d test_mm512_mask_roundscale_pd(__m512d __W, __mmask8 __U, __m512d __A) 
+//
+__m512d test_mm512_mask_roundscale_pd(__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_mask_roundscale_pd(__W,__U,__A, 1);
 }
 
-__m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A) 
+//
+__m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_maskz_roundscale_pd(__U,__A, 1);
 }
 
+//
 __m512d test_mm512_mask_roundscale_round_pd(__m512d __A,__mmask8 __U,__m512d __C)
 {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_round_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_mask_roundscale_round_pd(__A,__U,__C,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_maskz_roundscale_round_pd(__m512d __A,__mmask8 __U)
 {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_maskz_roundscale_round_pd(__U,__A,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512d test_mm512_roundscale_round_pd(__m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_roundscale_round_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
   return _mm512_roundscale_round_pd(__A,_MM_FROUND_TO_ZERO,_MM_FROUND_NO_EXC);
 }
 
+//
 __m512i test_mm512_max_epi32 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_max_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_max_epi32 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_max_epi32 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_max_epi32 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_max_epi64 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_max_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_max_epi64 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_max_epi64 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_max_epi64 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_max_epu64 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_max_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_max_epu64 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_max_epu64 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_max_epu64 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_max_epu32 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_max_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_max_epu32 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_max_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_max_epu32 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_max_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_max_epu32 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_min_epi32 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_min_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_min_epi32 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_min_epi32 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_epi32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_min_epi32 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_min_epu32 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_min_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
   return _mm512_min_epu32 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_mask_min_epu32 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_epu32 
-  // CHECK:       [[RES:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}})
-  // CHECK:       select <16 x i1> {{.*}}, <16 x i32> [[RES]], <16 x i32> {{.*}}
   return _mm512_maskz_min_epu32 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_min_epi64 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_min_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_min_epi64 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_min_epi64 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_epi64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_min_epi64 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_min_epu64 (__m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_min_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
   return _mm512_min_epu64 (__A,__B);
 }
 
+//
 __m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_mask_min_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_mask_min_epu64 (__W,__M,__A,__B);
 }
 
+//
 __m512i test_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
 {
-  // CHECK-LABEL: @test_mm512_maskz_min_epu64 
-  // CHECK:       [[RES:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}})
-  // CHECK:       select <8 x i1> {{.*}}, <8 x i64> [[RES]], <8 x i64> {{.*}}
   return _mm512_maskz_min_epu64 (__M,__A,__B);
 }
 
+//
 __m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_set1_epi32
-  // CHECK: insertelement <16 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 15
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_set1_epi32 ( __O, __M, __A);
 }
 
+//
 __m512i test_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{     
-  // CHECK-LABEL: @test_mm512_maskz_set1_epi32
-  // CHECK: insertelement <16 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i32> %{{.*}}, i32 %{{.*}}, i32 15
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+{
     return _mm512_maskz_set1_epi32(__M, __A);
 }
 
 
+//
 __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59,
     char e58, char e57, char e56, char e55, char e54, char e53, char e52,
     char e51, char e50, char e49, char e48, char e47, char e46, char e45,
@@ -10132,71 +7204,6 @@ __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59,
     char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2,
     char e1, char e0) {
 
-  //CHECK-LABEL: @test_mm512_set_epi8
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
-  //CHECK: load i8, ptr %{{.*}}, align 1
   return _mm512_set_epi8(e63, e62, e61, e60, e59, e58, e57, e56, e55, e54,
       e53, e52, e51, e50, e49, e48,e47, e46, e45, e44, e43, e42, e41, e40,
       e39, e38, e37, e36, e35, e34, e33, e32,e31, e30, e29, e28, e27, e26,
@@ -10204,677 +7211,384 @@ __m512i test_mm512_set_epi8(char e63, char e62, char e61, char e60, char e59,
       e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
 }
 
+//
 __m512i test_mm512_set_epi16(short e31, short e30, short e29, short e28,
     short e27, short e26, short e25, short e24, short e23, short e22,
     short e21, short e20, short e19, short e18, short e17,
     short e16, short e15, short e14, short e13, short e12,
     short e11, short e10, short e9, short e8, short e7,
     short e6, short e5, short e4, short e3, short e2, short e1, short e0) {
-  //CHECK-LABEL: @test_mm512_set_epi16
-  //CHECK: insertelement{{.*}}i32 0
-  //CHECK: insertelement{{.*}}i32 1
-  //CHECK: insertelement{{.*}}i32 2
-  //CHECK: insertelement{{.*}}i32 3
-  //CHECK: insertelement{{.*}}i32 4
-  //CHECK: insertelement{{.*}}i32 5
-  //CHECK: insertelement{{.*}}i32 6
-  //CHECK: insertelement{{.*}}i32 7
-  //CHECK: insertelement{{.*}}i32 8
-  //CHECK: insertelement{{.*}}i32 9
-  //CHECK: insertelement{{.*}}i32 10
-  //CHECK: insertelement{{.*}}i32 11
-  //CHECK: insertelement{{.*}}i32 12
-  //CHECK: insertelement{{.*}}i32 13
-  //CHECK: insertelement{{.*}}i32 14
-  //CHECK: insertelement{{.*}}i32 15
-  //CHECK: insertelement{{.*}}i32 16
-  //CHECK: insertelement{{.*}}i32 17
-  //CHECK: insertelement{{.*}}i32 18
-  //CHECK: insertelement{{.*}}i32 19
-  //CHECK: insertelement{{.*}}i32 20
-  //CHECK: insertelement{{.*}}i32 21
-  //CHECK: insertelement{{.*}}i32 22
-  //CHECK: insertelement{{.*}}i32 23
-  //CHECK: insertelement{{.*}}i32 24
-  //CHECK: insertelement{{.*}}i32 25
-  //CHECK: insertelement{{.*}}i32 26
-  //CHECK: insertelement{{.*}}i32 27
-  //CHECK: insertelement{{.*}}i32 28
-  //CHECK: insertelement{{.*}}i32 29
-  //CHECK: insertelement{{.*}}i32 30
-  //CHECK: insertelement{{.*}}i32 31
   return _mm512_set_epi16(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22,
       e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7,
       e6, e5, e4, e3, e2, e1, e0);
 
 }
+//
 __m512i test_mm512_set_epi32 (int __A, int __B, int __C, int __D,
                int __E, int __F, int __G, int __H,
                int __I, int __J, int __K, int __L,
                int __M, int __N, int __O, int __P)
 {
- //CHECK-LABEL: @test_mm512_set_epi32
- //CHECK: insertelement{{.*}}i32 0
- //CHECK: insertelement{{.*}}i32 1
- //CHECK: insertelement{{.*}}i32 2
- //CHECK: insertelement{{.*}}i32 3
- //CHECK: insertelement{{.*}}i32 4
- //CHECK: insertelement{{.*}}i32 5
- //CHECK: insertelement{{.*}}i32 6
- //CHECK: insertelement{{.*}}i32 7
- //CHECK: insertelement{{.*}}i32 8
- //CHECK: insertelement{{.*}}i32 9
- //CHECK: insertelement{{.*}}i32 10
- //CHECK: insertelement{{.*}}i32 11
- //CHECK: insertelement{{.*}}i32 12
- //CHECK: insertelement{{.*}}i32 13
- //CHECK: insertelement{{.*}}i32 14
- //CHECK: insertelement{{.*}}i32 15
  return _mm512_set_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
               __I, __J, __K, __L,__M, __N, __O, __P);
 }
 
+//
 __m512i test_mm512_setr_epi32 (int __A, int __B, int __C, int __D,
                int __E, int __F, int __G, int __H,
                int __I, int __J, int __K, int __L,
                int __M, int __N, int __O, int __P)
 {
- //CHECK-LABEL: @test_mm512_setr_epi32
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: load{{.*}}%{{.*}}, align 4
- //CHECK: insertelement{{.*}}i32 0
- //CHECK: insertelement{{.*}}i32 1
- //CHECK: insertelement{{.*}}i32 2
- //CHECK: insertelement{{.*}}i32 3
- //CHECK: insertelement{{.*}}i32 4
- //CHECK: insertelement{{.*}}i32 5
- //CHECK: insertelement{{.*}}i32 6
- //CHECK: insertelement{{.*}}i32 7
- //CHECK: insertelement{{.*}}i32 8
- //CHECK: insertelement{{.*}}i32 9
- //CHECK: insertelement{{.*}}i32 10
- //CHECK: insertelement{{.*}}i32 11
- //CHECK: insertelement{{.*}}i32 12
- //CHECK: insertelement{{.*}}i32 13
- //CHECK: insertelement{{.*}}i32 14
- //CHECK: insertelement{{.*}}i32 15
  return _mm512_setr_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
               __I, __J, __K, __L,__M, __N, __O, __P);
 }
 
+//
 __m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_set1_epi64
-  // CHECK: insertelement <8 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 7
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_set1_epi64 (__O, __M, __A);
 }
 
+//
 __m512i test_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_set1_epi64
-  // CHECK: insertelement <8 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i64> %{{.*}}, i64 %{{.*}}, i32 7
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_set1_epi64 (__M, __A);
 }
 
 
+//
 __m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C,
                               long long __D, long long __E, long long __F,
                               long long __G, long long __H)
 {
-    //CHECK-LABEL: @test_mm512_set_epi64
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
   return _mm512_set_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
 }
 
+//
 __m512i test_mm512_setr_epi64 (long long __A, long long __B, long long __C,
                               long long __D, long long __E, long long __F,
                               long long __G, long long __H)
 {
-    //CHECK-LABEL: @test_mm512_setr_epi64
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
   return _mm512_setr_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
 }
 
+//
 __m512d test_mm512_set_pd (double __A, double __B, double __C, double __D,
                            double __E, double __F, double __G, double __H)
 {
-    //CHECK-LABEL: @test_mm512_set_pd
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
   return _mm512_set_pd( __A, __B, __C, __D, __E, __F, __G, __H);
 }
 
+//
 __m512d test_mm512_setr_pd (double __A, double __B, double __C, double __D,
                            double __E, double __F, double __G, double __H)
 {
-    //CHECK-LABEL: @test_mm512_setr_pd
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: load{{.*}}%{{.*}}, align 8
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
   return _mm512_setr_pd( __A, __B, __C, __D, __E, __F, __G, __H);
 }
 
+//
 __m512 test_mm512_set_ps (float __A, float __B, float __C, float __D,
                           float __E, float __F, float __G, float __H,
                           float __I, float __J, float __K, float __L,
                           float __M, float __N, float __O, float __P)
 {
-    //CHECK-LABEL: @test_mm512_set_ps
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
-    //CHECK: insertelement{{.*}}i32 8
-    //CHECK: insertelement{{.*}}i32 9
-    //CHECK: insertelement{{.*}}i32 10
-    //CHECK: insertelement{{.*}}i32 11
-    //CHECK: insertelement{{.*}}i32 12
-    //CHECK: insertelement{{.*}}i32 13
-    //CHECK: insertelement{{.*}}i32 14
-    //CHECK: insertelement{{.*}}i32 15
     return _mm512_set_ps( __A, __B, __C, __D, __E, __F, __G, __H,
                           __I, __J, __K, __L, __M, __N, __O, __P);
 }
 
+//
 __m512i test_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_abs_epi64 
-  // CHECK: [[ABS:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %{{.*}}, i1 false)
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> [[ABS]], <8 x i64> %{{.*}}
   return _mm512_mask_abs_epi64 (__W,__U,__A);
 }
 
+//
 __m512i test_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_abs_epi64 
-  // CHECK: [[ABS:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %{{.*}}, i1 false)
-  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> [[ABS]], <8 x i64> %{{.*}}
   return _mm512_maskz_abs_epi64 (__U,__A);
 }
 
+//
 __m512i test_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_abs_epi32
-  // CHECK: [[ABS:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %{{.*}}, i1 false)
-  // CHECK: [[TMP:%.*]] = bitcast <16 x i32> [[ABS]] to <8 x i64>
-  // CHECK: [[ABS:%.*]] = bitcast <8 x i64> [[TMP]] to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> [[ABS]], <16 x i32> %{{.*}}
   return _mm512_mask_abs_epi32 (__W,__U,__A);
 }
 
+//
 __m512i test_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
 {
-  // CHECK-LABEL: @test_mm512_maskz_abs_epi32
-  // CHECK: [[ABS:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %{{.*}}, i1 false)
-  // CHECK: [[TMP:%.*]] = bitcast <16 x i32> [[ABS]] to <8 x i64>
-  // CHECK: [[ABS:%.*]] = bitcast <8 x i64> [[TMP]] to <16 x i32>
-  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> [[ABS]], <16 x i32> %{{.*}}
   return _mm512_maskz_abs_epi32 (__U,__A);
 }
 
+//
 __m512 test_mm512_setr_ps (float __A, float __B, float __C, float __D,
                           float __E, float __F, float __G, float __H,
                           float __I, float __J, float __K, float __L,
                           float __M, float __N, float __O, float __P)
 {
-    //CHECK-LABEL: @test_mm512_setr_ps
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: load{{.*}}%{{.*}}, align 4
-    //CHECK: insertelement{{.*}}i32 0
-    //CHECK: insertelement{{.*}}i32 1
-    //CHECK: insertelement{{.*}}i32 2
-    //CHECK: insertelement{{.*}}i32 3
-    //CHECK: insertelement{{.*}}i32 4
-    //CHECK: insertelement{{.*}}i32 5
-    //CHECK: insertelement{{.*}}i32 6
-    //CHECK: insertelement{{.*}}i32 7
-    //CHECK: insertelement{{.*}}i32 8
-    //CHECK: insertelement{{.*}}i32 9
-    //CHECK: insertelement{{.*}}i32 10
-    //CHECK: insertelement{{.*}}i32 11
-    //CHECK: insertelement{{.*}}i32 12
-    //CHECK: insertelement{{.*}}i32 13
-    //CHECK: insertelement{{.*}}i32 14
-    //CHECK: insertelement{{.*}}i32 15
     return _mm512_setr_ps( __A, __B, __C, __D, __E, __F, __G, __H,
                           __I, __J, __K, __L, __M, __N, __O, __P);
 }
 
+//
 int test_mm_cvtss_i32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtss_i32
-  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
   return _mm_cvtss_i32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvtss_i64(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtss_i64
-  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
   return _mm_cvtss_i64(A);
 }
 #endif
 
+//
 __m128d test_mm_cvti32_sd(__m128d A, int B) {
-  // CHECK-LABEL: test_mm_cvti32_sd
-  // CHECK: sitofp i32 %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_cvti32_sd(A, B);
 }
 
 #ifdef __x86_64__
+//
 __m128d test_mm_cvti64_sd(__m128d A, long long B) {
-  // CHECK-LABEL: test_mm_cvti64_sd
-  // CHECK: sitofp i64 %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_cvti64_sd(A, B);
 }
 #endif
 
+//
 __m128 test_mm_cvti32_ss(__m128 A, int B) {
-  // CHECK-LABEL: test_mm_cvti32_ss
-  // CHECK: sitofp i32 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_cvti32_ss(A, B);
 }
 
 #ifdef __x86_64__
+//
 __m128 test_mm_cvti64_ss(__m128 A, long long B) {
-  // CHECK-LABEL: test_mm_cvti64_ss
-  // CHECK: sitofp i64 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_cvti64_ss(A, B);
 }
 #endif
 
+//
 int test_mm_cvtsd_i32(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtsd_i32
-  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
   return _mm_cvtsd_i32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvtsd_i64(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtsd_i64
-  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
   return _mm_cvtsd_i64(A);
 }
 #endif
 
+//
 __m128d test_mm_mask_cvtss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_mask_cvtss_sd
-  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
-  return _mm_mask_cvtss_sd(__W, __U, __A, __B); 
+  return _mm_mask_cvtss_sd(__W, __U, __A, __B);
 }
 
+//
 __m128d test_mm_maskz_cvtss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm_maskz_cvtss_sd
-  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
-  return _mm_maskz_cvtss_sd( __U, __A, __B); 
+  return _mm_maskz_cvtss_sd( __U, __A, __B);
 }
 
+//
 __m128 test_mm_mask_cvtsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_mask_cvtsd_ss
-  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
-  return _mm_mask_cvtsd_ss(__W, __U, __A, __B); 
+  return _mm_mask_cvtsd_ss(__W, __U, __A, __B);
 }
 
+//
 __m128 test_mm_maskz_cvtsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm_maskz_cvtsd_ss
-  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
-  return _mm_maskz_cvtsd_ss(__U, __A, __B); 
+  return _mm_maskz_cvtsd_ss(__U, __A, __B);
 }
 
 
+//
 __m512i test_mm512_setzero_epi32(void)
 {
-  // CHECK-LABEL: @test_mm512_setzero_epi32
-  // CHECK: zeroinitializer
   return _mm512_setzero_epi32();
 }
 
+//
 __m512 test_mm512_setzero(void)
 {
-  // CHECK-LABEL: @test_mm512_setzero
-  // CHECK: zeroinitializer
   return _mm512_setzero();
 }
 
+//
 __m512i test_mm512_setzero_si512(void)
 {
-  // CHECK-LABEL: @test_mm512_setzero_si512
-  // CHECK: zeroinitializer
   return _mm512_setzero_si512();
 }
 
+//
 __m512 test_mm512_setzero_ps(void)
 {
-  // CHECK-LABEL: @test_mm512_setzero_ps
-  // CHECK: zeroinitializer
   return _mm512_setzero_ps();
 }
 
+//
 __m512d test_mm512_setzero_pd(void)
 {
-  // CHECK-LABEL: @test_mm512_setzero_pd
-  // CHECK: zeroinitializer
   return _mm512_setzero_pd();
 }
 
+//
 __mmask16 test_mm512_int2mask(int __a)
 {
-  // CHECK-LABEL: test_mm512_int2mask
-  // CHECK: trunc i32 %{{.*}} to i16
   return _mm512_int2mask(__a);
 }
 
+//
 int test_mm512_mask2int(__mmask16 __a)
 {
-  // CHECK-LABEL: test_mm512_mask2int
-  // CHECK: zext i16 %{{.*}} to i32
   return _mm512_mask2int(__a);
 }
 
+//
 __m128 test_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
 {
-  // CHECK-LABEL: @test_mm_mask_move_ss
-  // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
-  // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
   return _mm_mask_move_ss ( __W,  __U,  __A,  __B);
 }
 
+//
 __m128 test_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
 {
-  // CHECK-LABEL: @test_mm_maskz_move_ss
-  // CHECK: [[EXT:%.*]] = extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <4 x float> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, float [[A]], float [[B]]
-  // CHECK-NEXT: insertelement <4 x float> [[VEC]], float [[SEL]], i64 0
   return _mm_maskz_move_ss (__U, __A, __B);
 }
 
+//
 __m128d test_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
 {
-  // CHECK-LABEL: @test_mm_mask_move_sd
-  // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, double [[A]], double [[B]]
-  // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
   return _mm_mask_move_sd ( __W,  __U,  __A,  __B);
 }
 
+//
 __m128d test_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
 {
-  // CHECK-LABEL: @test_mm_maskz_move_sd
-  // CHECK: [[EXT:%.*]] = extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <2 x double> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %13, double [[A]], double [[B]]
-  // CHECK-NEXT: insertelement <2 x double> [[VEC]], double [[SEL]], i64 0
   return _mm_maskz_move_sd (__U, __A, __B);
 }
 
+//
 void test_mm_mask_store_ss(float * __P, __mmask8 __U, __m128 __A)
 {
-  // CHECK-LABEL: @test_mm_mask_store_ss
-  // CHECK: call void @llvm.masked.store.v4f32.p0(<4 x float> %{{.*}}, ptr %{{.*}}, i32 1, <4 x i1> %{{.*}})
   _mm_mask_store_ss(__P, __U, __A);
 }
 
+//
 void test_mm_mask_store_sd(double * __P, __mmask8 __U, __m128d __A)
 {
-  // CHECK-LABEL: @test_mm_mask_store_sd
-  // CHECK: call void @llvm.masked.store.v2f64.p0(<2 x double> %{{.*}}, ptr %{{.*}}, i32 1, <2 x i1> %{{.*}})
   _mm_mask_store_sd(__P, __U, __A);
 }
 
+//
 __m128 test_mm_mask_load_ss(__m128 __A, __mmask8 __U, const float* __W)
 {
-  // CHECK-LABEL: @test_mm_mask_load_ss
-  // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
   return _mm_mask_load_ss(__A, __U, __W);
 }
 
+//
 __m128 test_mm_maskz_load_ss (__mmask8 __U, const float * __W)
 {
-  // CHECK-LABEL: @test_mm_maskz_load_ss
-  // CHECK: call <4 x float> @llvm.masked.load.v4f32.p0(ptr %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
   return _mm_maskz_load_ss (__U, __W);
 }
 
+//
 __m128d test_mm_mask_load_sd (__m128d __A, __mmask8 __U, const double * __W)
 {
-  // CHECK-LABEL: @test_mm_mask_load_sd
-  // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
   return _mm_mask_load_sd (__A, __U, __W);
 }
 
+//
 __m128d test_mm_maskz_load_sd (__mmask8 __U, const double * __W)
 {
-  // CHECK-LABEL: @test_mm_maskz_load_sd
-  // CHECK: call <2 x double> @llvm.masked.load.v2f64.p0(ptr %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
   return _mm_maskz_load_sd (__U, __W);
 }
 
+//
 __m512d test_mm512_abs_pd(__m512d a){
-  // CHECK-LABEL: @test_mm512_abs_pd
-  // CHECK: and <8 x i64> 
   return _mm512_abs_pd(a);
 }
 
+//
 __m512d test_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A){
-  // CHECK-LABEL: @test_mm512_mask_abs_pd 
-  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
-  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_abs_pd (__W,__U,__A);
 }
 
+//
 __m512 test_mm512_abs_ps(__m512 a){
-  // CHECK-LABEL: @test_mm512_abs_ps
-  // CHECK: and <16 x i32> 
   return _mm512_abs_ps(a);
 }
 
+//
 __m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){
-  // CHECK-LABEL: @test_mm512_mask_abs_ps
-  // CHECK: and <16 x i32> 
-  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_abs_ps( __W, __U, __A);
 }
 
+//
 __m512d test_mm512_zextpd128_pd512(__m128d A) {
-  // CHECK-LABEL: test_mm512_zextpd128_pd512
-  // CHECK: store <2 x double> zeroinitializer
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   return _mm512_zextpd128_pd512(A);
 }
 
+//
 __m512d test_mm512_zextpd256_pd512(__m256d A) {
-  // CHECK-LABEL: test_mm512_zextpd256_pd512
-  // CHECK: store <4 x double> zeroinitializer
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_zextpd256_pd512(A);
 }
 
+//
 __m512 test_mm512_zextps128_ps512(__m128 A) {
-  // CHECK-LABEL: test_mm512_zextps128_ps512
-  // CHECK: store <4 x float> zeroinitializer
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
   return _mm512_zextps128_ps512(A);
 }
 
+//
 __m512 test_mm512_zextps256_ps512(__m256 A) {
-  // CHECK-LABEL: test_mm512_zextps256_ps512
-  // CHECK: store <8 x float> zeroinitializer
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_zextps256_ps512(A);
 }
 
+//
 __m512i test_mm512_zextsi128_si512(__m128i A) {
-  // CHECK-LABEL: test_mm512_zextsi128_si512
-  // CHECK: store <2 x i64> zeroinitializer
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   return _mm512_zextsi128_si512(A);
 }
 
+//
 __m512i test_mm512_zextsi256_si512(__m256i A) {
-  // CHECK-LABEL: test_mm512_zextsi256_si512
-  // CHECK: store <4 x i64> zeroinitializer
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_zextsi256_si512(A);
 }
 
+//
 __m512d test_mm512_i32logather_pd(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32logather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.dpd.512
   return _mm512_i32logather_pd(__index, __addr, 2);
 }
 
+//
 __m512d test_mm512_mask_i32logather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32logather_pd
-  // CHECK: @llvm.x86.avx512.mask.gather.dpd.512
   return _mm512_mask_i32logather_pd(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 void test_mm512_i32loscatter_pd(void *__addr, __m512i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_i32loscatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpd.512
   return _mm512_i32loscatter_pd(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32loscatter_pd(void *__addr, __mmask8 __mask, __m512i __index, __m512d __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32loscatter_pd
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpd.512
   return _mm512_mask_i32loscatter_pd(__addr, __mask, __index, __v1, 2);
 }
 
+//
 __m512i test_mm512_i32logather_epi64(__m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_i32logather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.dpq.512
   return _mm512_i32logather_epi64(__index, __addr, 2);
 }
 
+//
 __m512i test_mm512_mask_i32logather_epi64(__m512i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
-  // CHECK-LABEL: @test_mm512_mask_i32logather_epi64
-  // CHECK: @llvm.x86.avx512.mask.gather.dpq.512
   return _mm512_mask_i32logather_epi64(__v1_old, __mask, __index, __addr, 2);
 }
 
+//
 void test_mm512_i32loscatter_epi64(void *__addr, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_i32loscatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
   _mm512_i32loscatter_epi64(__addr, __index, __v1, 2);
 }
 
+//
 void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i __index, __m512i __v1) {
-  // CHECK-LABEL: @test_mm512_mask_i32loscatter_epi64
-  // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512
   _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2);
 }
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c
index a766476ca92bd1..a913401d3f74d1 100644
--- a/clang/test/CodeGen/X86/avx512fp16-builtins.c
+++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c
@@ -1,108 +1,462 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -ffreestanding -flax-vector-conversions=none %s -triple=x86_64-unknown-unknown -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s
 
 #include <immintrin.h>
 
+// CHECK-LABEL: define dso_local half @test_mm512_cvtsh_h(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <32 x half> [[TMP1]], i32 0
+// CHECK-NEXT:    ret half [[VECEXT_I]]
+//
 _Float16 test_mm512_cvtsh_h(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_cvtsh_h
-  // CHECK: extractelement <32 x half> %{{.*}}, i32 0
   return _mm512_cvtsh_h(__A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_setzero_ph(
+// CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP0]]
+//
 __m128h test_mm_setzero_ph(void) {
-  // CHECK-LABEL: @test_mm_setzero_ph
-  // CHECK: zeroinitializer
   return _mm_setzero_ph();
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_setzero_ph(
+// CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    ret <16 x half> [[TMP0]]
+//
 __m256h test_mm256_setzero_ph(void) {
-  // CHECK-LABEL: @test_mm256_setzero_ph
-  // CHECK: zeroinitializer
   return _mm256_setzero_ph();
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_undefined_ph(
+// CHECK-SAME: ) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <16 x half> zeroinitializer
+//
 __m256h test_mm256_undefined_ph(void) {
-  // CHECK-LABEL: @test_mm256_undefined_ph
-  // CHECK: ret <16 x half> zeroinitializer
   return _mm256_undefined_ph();
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_setzero_ph(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    ret <32 x half> [[TMP0]]
+//
 __m512h test_mm512_setzero_ph(void) {
-  // CHECK-LABEL: @test_mm512_setzero_ph
-  // CHECK: zeroinitializer
   return _mm512_setzero_ph();
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_undefined_ph(
+// CHECK-SAME: ) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <8 x half> zeroinitializer
+//
 __m128h test_mm_undefined_ph(void) {
-  // CHECK-LABEL: @test_mm_undefined_ph
-  // CHECK: ret <8 x half> zeroinitializer
   return _mm_undefined_ph();
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_undefined_ph(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <32 x half> zeroinitializer
+//
 __m512h test_mm512_undefined_ph(void) {
-  // CHECK-LABEL: @test_mm512_undefined_ph
-  // CHECK: ret <32 x half> zeroinitializer
   return _mm512_undefined_ph();
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_set1_ph(
+// CHECK-SAME: half noundef [[H:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__H_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[H_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    store half [[H]], ptr [[H_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr [[H_ADDR]], align 2
+// CHECK-NEXT:    store half [[TMP0]], ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <32 x half> undef, half [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <32 x half> [[VECINIT_I]], half [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <32 x half> [[VECINIT1_I]], half [[TMP3]], i32 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <32 x half> [[VECINIT2_I]], half [[TMP4]], i32 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <32 x half> [[VECINIT3_I]], half [[TMP5]], i32 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <32 x half> [[VECINIT4_I]], half [[TMP6]], i32 5
+// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <32 x half> [[VECINIT5_I]], half [[TMP7]], i32 6
+// CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <32 x half> [[VECINIT6_I]], half [[TMP8]], i32 7
+// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <32 x half> [[VECINIT7_I]], half [[TMP9]], i32 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <32 x half> [[VECINIT8_I]], half [[TMP10]], i32 9
+// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <32 x half> [[VECINIT9_I]], half [[TMP11]], i32 10
+// CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <32 x half> [[VECINIT10_I]], half [[TMP12]], i32 11
+// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <32 x half> [[VECINIT11_I]], half [[TMP13]], i32 12
+// CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <32 x half> [[VECINIT12_I]], half [[TMP14]], i32 13
+// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <32 x half> [[VECINIT13_I]], half [[TMP15]], i32 14
+// CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <32 x half> [[VECINIT14_I]], half [[TMP16]], i32 15
+// CHECK-NEXT:    [[TMP17:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT16_I:%.*]] = insertelement <32 x half> [[VECINIT15_I]], half [[TMP17]], i32 16
+// CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT17_I:%.*]] = insertelement <32 x half> [[VECINIT16_I]], half [[TMP18]], i32 17
+// CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT18_I:%.*]] = insertelement <32 x half> [[VECINIT17_I]], half [[TMP19]], i32 18
+// CHECK-NEXT:    [[TMP20:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT19_I:%.*]] = insertelement <32 x half> [[VECINIT18_I]], half [[TMP20]], i32 19
+// CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT20_I:%.*]] = insertelement <32 x half> [[VECINIT19_I]], half [[TMP21]], i32 20
+// CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT21_I:%.*]] = insertelement <32 x half> [[VECINIT20_I]], half [[TMP22]], i32 21
+// CHECK-NEXT:    [[TMP23:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT22_I:%.*]] = insertelement <32 x half> [[VECINIT21_I]], half [[TMP23]], i32 22
+// CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT23_I:%.*]] = insertelement <32 x half> [[VECINIT22_I]], half [[TMP24]], i32 23
+// CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT24_I:%.*]] = insertelement <32 x half> [[VECINIT23_I]], half [[TMP25]], i32 24
+// CHECK-NEXT:    [[TMP26:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT25_I:%.*]] = insertelement <32 x half> [[VECINIT24_I]], half [[TMP26]], i32 25
+// CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT26_I:%.*]] = insertelement <32 x half> [[VECINIT25_I]], half [[TMP27]], i32 26
+// CHECK-NEXT:    [[TMP28:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT27_I:%.*]] = insertelement <32 x half> [[VECINIT26_I]], half [[TMP28]], i32 27
+// CHECK-NEXT:    [[TMP29:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT28_I:%.*]] = insertelement <32 x half> [[VECINIT27_I]], half [[TMP29]], i32 28
+// CHECK-NEXT:    [[TMP30:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT29_I:%.*]] = insertelement <32 x half> [[VECINIT28_I]], half [[TMP30]], i32 29
+// CHECK-NEXT:    [[TMP31:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT30_I:%.*]] = insertelement <32 x half> [[VECINIT29_I]], half [[TMP31]], i32 30
+// CHECK-NEXT:    [[TMP32:%.*]] = load half, ptr [[__H_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT31_I:%.*]] = insertelement <32 x half> [[VECINIT30_I]], half [[TMP32]], i32 31
+// CHECK-NEXT:    store <32 x half> [[VECINIT31_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP33:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    ret <32 x half> [[TMP33]]
+//
 __m512h test_mm512_set1_ph(_Float16 h) {
-  // CHECK-LABEL: @test_mm512_set1_ph
-  // CHECK: insertelement <32 x half> {{.*}}, i32 0
-  // CHECK: insertelement <32 x half> {{.*}}, i32 1
-  // CHECK: insertelement <32 x half> {{.*}}, i32 2
-  // CHECK: insertelement <32 x half> {{.*}}, i32 3
-  // CHECK: insertelement <32 x half> {{.*}}, i32 4
-  // CHECK: insertelement <32 x half> {{.*}}, i32 5
-  // CHECK: insertelement <32 x half> {{.*}}, i32 6
-  // CHECK: insertelement <32 x half> {{.*}}, i32 7
-  // CHECK: insertelement <32 x half> {{.*}}, i32 8
-  // CHECK: insertelement <32 x half> {{.*}}, i32 9
-  // CHECK: insertelement <32 x half> {{.*}}, i32 10
-  // CHECK: insertelement <32 x half> {{.*}}, i32 11
-  // CHECK: insertelement <32 x half> {{.*}}, i32 12
-  // CHECK: insertelement <32 x half> {{.*}}, i32 13
-  // CHECK: insertelement <32 x half> {{.*}}, i32 14
-  // CHECK: insertelement <32 x half> {{.*}}, i32 15
-  // CHECK: insertelement <32 x half> {{.*}}, i32 16
-  // CHECK: insertelement <32 x half> {{.*}}, i32 17
-  // CHECK: insertelement <32 x half> {{.*}}, i32 18
-  // CHECK: insertelement <32 x half> {{.*}}, i32 19
-  // CHECK: insertelement <32 x half> {{.*}}, i32 20
-  // CHECK: insertelement <32 x half> {{.*}}, i32 21
-  // CHECK: insertelement <32 x half> {{.*}}, i32 22
-  // CHECK: insertelement <32 x half> {{.*}}, i32 23
-  // CHECK: insertelement <32 x half> {{.*}}, i32 24
-  // CHECK: insertelement <32 x half> {{.*}}, i32 25
-  // CHECK: insertelement <32 x half> {{.*}}, i32 26
-  // CHECK: insertelement <32 x half> {{.*}}, i32 27
-  // CHECK: insertelement <32 x half> {{.*}}, i32 28
-  // CHECK: insertelement <32 x half> {{.*}}, i32 29
-  // CHECK: insertelement <32 x half> {{.*}}, i32 30
-  // CHECK: insertelement <32 x half> {{.*}}, i32 31
   return _mm512_set1_ph(h);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_set1_pch(
+// CHECK-SAME: <2 x half> noundef [[H_COERCE:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[H_I:%.*]] = alloca { half, half }, align 2
+// CHECK-NEXT:    [[H:%.*]] = alloca { half, half }, align 2
+// CHECK-NEXT:    [[COERCE:%.*]] = alloca { half, half }, align 2
+// CHECK-NEXT:    store <2 x half> [[H_COERCE]], ptr [[H]], align 2
+// CHECK-NEXT:    [[H_REALP:%.*]] = getelementptr inbounds { half, half }, ptr [[H]], i32 0, i32 0
+// CHECK-NEXT:    [[H_REAL:%.*]] = load half, ptr [[H_REALP]], align 2
+// CHECK-NEXT:    [[H_IMAGP:%.*]] = getelementptr inbounds { half, half }, ptr [[H]], i32 0, i32 1
+// CHECK-NEXT:    [[H_IMAG:%.*]] = load half, ptr [[H_IMAGP]], align 2
+// CHECK-NEXT:    [[COERCE_REALP:%.*]] = getelementptr inbounds { half, half }, ptr [[COERCE]], i32 0, i32 0
+// CHECK-NEXT:    [[COERCE_IMAGP:%.*]] = getelementptr inbounds { half, half }, ptr [[COERCE]], i32 0, i32 1
+// CHECK-NEXT:    store half [[H_REAL]], ptr [[COERCE_REALP]], align 2
+// CHECK-NEXT:    store half [[H_IMAG]], ptr [[COERCE_IMAGP]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[COERCE]], align 2
+// CHECK-NEXT:    store <2 x half> [[TMP0]], ptr [[H_I]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[H_I]], align 2
+// CHECK-NEXT:    store float [[TMP1]], ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x float> undef, float [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x float> [[VECINIT_I]], float [[TMP3]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x float> [[VECINIT1_I]], float [[TMP4]], i32 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x float> [[VECINIT2_I]], float [[TMP5]], i32 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x float> [[VECINIT3_I]], float [[TMP6]], i32 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x float> [[VECINIT4_I]], float [[TMP7]], i32 5
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x float> [[VECINIT5_I]], float [[TMP8]], i32 6
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x float> [[VECINIT6_I]], float [[TMP9]], i32 7
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x float> [[VECINIT7_I]], float [[TMP10]], i32 8
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x float> [[VECINIT8_I]], float [[TMP11]], i32 9
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x float> [[VECINIT9_I]], float [[TMP12]], i32 10
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x float> [[VECINIT10_I]], float [[TMP13]], i32 11
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x float> [[VECINIT11_I]], float [[TMP14]], i32 12
+// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x float> [[VECINIT12_I]], float [[TMP15]], i32 13
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x float> [[VECINIT13_I]], float [[TMP16]], i32 14
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x float> [[VECINIT14_I]], float [[TMP17]], i32 15
+// CHECK-NEXT:    store <16 x float> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP18:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP19:%.*]] = bitcast <16 x float> [[TMP18]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP19]]
+//
 __m512h test_mm512_set1_pch(_Float16 _Complex h) {
-  // CHECK-LABEL: @test_mm512_set1_pch
-  // CHECK: insertelement <16 x float> {{.*}}, i32 0
-  // CHECK: insertelement <16 x float> {{.*}}, i32 1
-  // CHECK: insertelement <16 x float> {{.*}}, i32 2
-  // CHECK: insertelement <16 x float> {{.*}}, i32 3
-  // CHECK: insertelement <16 x float> {{.*}}, i32 4
-  // CHECK: insertelement <16 x float> {{.*}}, i32 5
-  // CHECK: insertelement <16 x float> {{.*}}, i32 6
-  // CHECK: insertelement <16 x float> {{.*}}, i32 7
-  // CHECK: insertelement <16 x float> {{.*}}, i32 8
-  // CHECK: insertelement <16 x float> {{.*}}, i32 9
-  // CHECK: insertelement <16 x float> {{.*}}, i32 10
-  // CHECK: insertelement <16 x float> {{.*}}, i32 11
-  // CHECK: insertelement <16 x float> {{.*}}, i32 12
-  // CHECK: insertelement <16 x float> {{.*}}, i32 13
-  // CHECK: insertelement <16 x float> {{.*}}, i32 14
-  // CHECK: insertelement <16 x float> {{.*}}, i32 15
-  // CHECK: bitcast <16 x float>{{.*}} to <32 x half>
   return _mm512_set1_pch(h);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_set_ph(
+// CHECK-SAME: half noundef [[__H1:%.*]], half noundef [[__H2:%.*]], half noundef [[__H3:%.*]], half noundef [[__H4:%.*]], half noundef [[__H5:%.*]], half noundef [[__H6:%.*]], half noundef [[__H7:%.*]], half noundef [[__H8:%.*]], half noundef [[__H9:%.*]], half noundef [[__H10:%.*]], half noundef [[__H11:%.*]], half noundef [[__H12:%.*]], half noundef [[__H13:%.*]], half noundef [[__H14:%.*]], half noundef [[__H15:%.*]], half noundef [[__H16:%.*]], half noundef [[__H17:%.*]], half noundef [[__H18:%.*]], half noundef [[__H19:%.*]], half noundef [[__H20:%.*]], half noundef [[__H21:%.*]], half noundef [[__H22:%.*]], half noundef [[__H23:%.*]], half noundef [[__H24:%.*]], half noundef [[__H25:%.*]], half noundef [[__H26:%.*]], half noundef [[__H27:%.*]], half noundef [[__H28:%.*]], half noundef [[__H29:%.*]], half noundef [[__H30:%.*]], half noundef [[__H31:%.*]], half noundef [[__H32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__H1_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H2_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H3_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H4_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H5_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H6_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H7_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H8_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H9_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H10_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H11_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H12_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H13_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H14_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H15_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H16_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H17_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H18_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H19_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H20_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H21_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H22_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H23_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H24_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H25_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H26_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H27_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H28_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H29_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H30_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H31_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H32_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__H1_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H2_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H3_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H4_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H5_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H6_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H7_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H8_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H9_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H10_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H11_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H12_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H13_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H14_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H15_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H16_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H17_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H18_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H19_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H20_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H21_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H22_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H23_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H24_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H25_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H26_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H27_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H28_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H29_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H30_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H31_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H32_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    store half [[__H1]], ptr [[__H1_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H2]], ptr [[__H2_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H3]], ptr [[__H3_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H4]], ptr [[__H4_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H5]], ptr [[__H5_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H6]], ptr [[__H6_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H7]], ptr [[__H7_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H8]], ptr [[__H8_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H9]], ptr [[__H9_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H10]], ptr [[__H10_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H11]], ptr [[__H11_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H12]], ptr [[__H12_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H13]], ptr [[__H13_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H14]], ptr [[__H14_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H15]], ptr [[__H15_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H16]], ptr [[__H16_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H17]], ptr [[__H17_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H18]], ptr [[__H18_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H19]], ptr [[__H19_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H20]], ptr [[__H20_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H21]], ptr [[__H21_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H22]], ptr [[__H22_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H23]], ptr [[__H23_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H24]], ptr [[__H24_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H25]], ptr [[__H25_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H26]], ptr [[__H26_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H27]], ptr [[__H27_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H28]], ptr [[__H28_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H29]], ptr [[__H29_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H30]], ptr [[__H30_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H31]], ptr [[__H31_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H32]], ptr [[__H32_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr [[__H1_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__H2_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[__H3_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__H4_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[__H5_ADDR]], align 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__H6_ADDR]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr [[__H7_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__H8_ADDR]], align 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[__H9_ADDR]], align 2
+// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__H10_ADDR]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[__H11_ADDR]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__H12_ADDR]], align 2
+// CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr [[__H13_ADDR]], align 2
+// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__H14_ADDR]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[__H15_ADDR]], align 2
+// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__H16_ADDR]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr [[__H17_ADDR]], align 2
+// CHECK-NEXT:    [[TMP17:%.*]] = load half, ptr [[__H18_ADDR]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr [[__H19_ADDR]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr [[__H20_ADDR]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = load half, ptr [[__H21_ADDR]], align 2
+// CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr [[__H22_ADDR]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr [[__H23_ADDR]], align 2
+// CHECK-NEXT:    [[TMP23:%.*]] = load half, ptr [[__H24_ADDR]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr [[__H25_ADDR]], align 2
+// CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr [[__H26_ADDR]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = load half, ptr [[__H27_ADDR]], align 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr [[__H28_ADDR]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = load half, ptr [[__H29_ADDR]], align 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load half, ptr [[__H30_ADDR]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = load half, ptr [[__H31_ADDR]], align 2
+// CHECK-NEXT:    [[TMP31:%.*]] = load half, ptr [[__H32_ADDR]], align 2
+// CHECK-NEXT:    store half [[TMP0]], ptr [[__H1_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP1]], ptr [[__H2_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP2]], ptr [[__H3_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP3]], ptr [[__H4_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP4]], ptr [[__H5_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP5]], ptr [[__H6_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP6]], ptr [[__H7_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP7]], ptr [[__H8_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP8]], ptr [[__H9_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP9]], ptr [[__H10_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP10]], ptr [[__H11_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP11]], ptr [[__H12_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP12]], ptr [[__H13_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP13]], ptr [[__H14_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP14]], ptr [[__H15_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP15]], ptr [[__H16_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP16]], ptr [[__H17_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP17]], ptr [[__H18_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP18]], ptr [[__H19_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP19]], ptr [[__H20_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP20]], ptr [[__H21_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP21]], ptr [[__H22_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP22]], ptr [[__H23_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP23]], ptr [[__H24_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP24]], ptr [[__H25_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP25]], ptr [[__H26_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP26]], ptr [[__H27_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP27]], ptr [[__H28_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP28]], ptr [[__H29_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP29]], ptr [[__H30_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP30]], ptr [[__H31_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP31]], ptr [[__H32_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = load half, ptr [[__H32_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <32 x half> undef, half [[TMP32]], i32 0
+// CHECK-NEXT:    [[TMP33:%.*]] = load half, ptr [[__H31_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <32 x half> [[VECINIT_I]], half [[TMP33]], i32 1
+// CHECK-NEXT:    [[TMP34:%.*]] = load half, ptr [[__H30_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <32 x half> [[VECINIT1_I]], half [[TMP34]], i32 2
+// CHECK-NEXT:    [[TMP35:%.*]] = load half, ptr [[__H29_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <32 x half> [[VECINIT2_I]], half [[TMP35]], i32 3
+// CHECK-NEXT:    [[TMP36:%.*]] = load half, ptr [[__H28_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <32 x half> [[VECINIT3_I]], half [[TMP36]], i32 4
+// CHECK-NEXT:    [[TMP37:%.*]] = load half, ptr [[__H27_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <32 x half> [[VECINIT4_I]], half [[TMP37]], i32 5
+// CHECK-NEXT:    [[TMP38:%.*]] = load half, ptr [[__H26_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <32 x half> [[VECINIT5_I]], half [[TMP38]], i32 6
+// CHECK-NEXT:    [[TMP39:%.*]] = load half, ptr [[__H25_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <32 x half> [[VECINIT6_I]], half [[TMP39]], i32 7
+// CHECK-NEXT:    [[TMP40:%.*]] = load half, ptr [[__H24_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <32 x half> [[VECINIT7_I]], half [[TMP40]], i32 8
+// CHECK-NEXT:    [[TMP41:%.*]] = load half, ptr [[__H23_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <32 x half> [[VECINIT8_I]], half [[TMP41]], i32 9
+// CHECK-NEXT:    [[TMP42:%.*]] = load half, ptr [[__H22_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <32 x half> [[VECINIT9_I]], half [[TMP42]], i32 10
+// CHECK-NEXT:    [[TMP43:%.*]] = load half, ptr [[__H21_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <32 x half> [[VECINIT10_I]], half [[TMP43]], i32 11
+// CHECK-NEXT:    [[TMP44:%.*]] = load half, ptr [[__H20_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <32 x half> [[VECINIT11_I]], half [[TMP44]], i32 12
+// CHECK-NEXT:    [[TMP45:%.*]] = load half, ptr [[__H19_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <32 x half> [[VECINIT12_I]], half [[TMP45]], i32 13
+// CHECK-NEXT:    [[TMP46:%.*]] = load half, ptr [[__H18_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <32 x half> [[VECINIT13_I]], half [[TMP46]], i32 14
+// CHECK-NEXT:    [[TMP47:%.*]] = load half, ptr [[__H17_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <32 x half> [[VECINIT14_I]], half [[TMP47]], i32 15
+// CHECK-NEXT:    [[TMP48:%.*]] = load half, ptr [[__H16_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT16_I:%.*]] = insertelement <32 x half> [[VECINIT15_I]], half [[TMP48]], i32 16
+// CHECK-NEXT:    [[TMP49:%.*]] = load half, ptr [[__H15_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT17_I:%.*]] = insertelement <32 x half> [[VECINIT16_I]], half [[TMP49]], i32 17
+// CHECK-NEXT:    [[TMP50:%.*]] = load half, ptr [[__H14_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT18_I:%.*]] = insertelement <32 x half> [[VECINIT17_I]], half [[TMP50]], i32 18
+// CHECK-NEXT:    [[TMP51:%.*]] = load half, ptr [[__H13_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT19_I:%.*]] = insertelement <32 x half> [[VECINIT18_I]], half [[TMP51]], i32 19
+// CHECK-NEXT:    [[TMP52:%.*]] = load half, ptr [[__H12_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT20_I:%.*]] = insertelement <32 x half> [[VECINIT19_I]], half [[TMP52]], i32 20
+// CHECK-NEXT:    [[TMP53:%.*]] = load half, ptr [[__H11_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT21_I:%.*]] = insertelement <32 x half> [[VECINIT20_I]], half [[TMP53]], i32 21
+// CHECK-NEXT:    [[TMP54:%.*]] = load half, ptr [[__H10_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT22_I:%.*]] = insertelement <32 x half> [[VECINIT21_I]], half [[TMP54]], i32 22
+// CHECK-NEXT:    [[TMP55:%.*]] = load half, ptr [[__H9_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT23_I:%.*]] = insertelement <32 x half> [[VECINIT22_I]], half [[TMP55]], i32 23
+// CHECK-NEXT:    [[TMP56:%.*]] = load half, ptr [[__H8_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT24_I:%.*]] = insertelement <32 x half> [[VECINIT23_I]], half [[TMP56]], i32 24
+// CHECK-NEXT:    [[TMP57:%.*]] = load half, ptr [[__H7_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT25_I:%.*]] = insertelement <32 x half> [[VECINIT24_I]], half [[TMP57]], i32 25
+// CHECK-NEXT:    [[TMP58:%.*]] = load half, ptr [[__H6_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT26_I:%.*]] = insertelement <32 x half> [[VECINIT25_I]], half [[TMP58]], i32 26
+// CHECK-NEXT:    [[TMP59:%.*]] = load half, ptr [[__H5_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT27_I:%.*]] = insertelement <32 x half> [[VECINIT26_I]], half [[TMP59]], i32 27
+// CHECK-NEXT:    [[TMP60:%.*]] = load half, ptr [[__H4_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT28_I:%.*]] = insertelement <32 x half> [[VECINIT27_I]], half [[TMP60]], i32 28
+// CHECK-NEXT:    [[TMP61:%.*]] = load half, ptr [[__H3_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT29_I:%.*]] = insertelement <32 x half> [[VECINIT28_I]], half [[TMP61]], i32 29
+// CHECK-NEXT:    [[TMP62:%.*]] = load half, ptr [[__H2_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT30_I:%.*]] = insertelement <32 x half> [[VECINIT29_I]], half [[TMP62]], i32 30
+// CHECK-NEXT:    [[TMP63:%.*]] = load half, ptr [[__H1_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT31_I:%.*]] = insertelement <32 x half> [[VECINIT30_I]], half [[TMP63]], i32 31
+// CHECK-NEXT:    store <32 x half> [[VECINIT31_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP64:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    ret <32 x half> [[TMP64]]
+//
 __m512h test_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
                           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
                           _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
@@ -111,45 +465,244 @@ __m512h test_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16
                           _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
                           _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
                           _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
-  // CHECK-LABEL: @test_mm512_set_ph
-  // CHECK: insertelement <32 x half> {{.*}}, i32 0
-  // CHECK: insertelement <32 x half> {{.*}}, i32 1
-  // CHECK: insertelement <32 x half> {{.*}}, i32 2
-  // CHECK: insertelement <32 x half> {{.*}}, i32 3
-  // CHECK: insertelement <32 x half> {{.*}}, i32 4
-  // CHECK: insertelement <32 x half> {{.*}}, i32 5
-  // CHECK: insertelement <32 x half> {{.*}}, i32 6
-  // CHECK: insertelement <32 x half> {{.*}}, i32 7
-  // CHECK: insertelement <32 x half> {{.*}}, i32 8
-  // CHECK: insertelement <32 x half> {{.*}}, i32 9
-  // CHECK: insertelement <32 x half> {{.*}}, i32 10
-  // CHECK: insertelement <32 x half> {{.*}}, i32 11
-  // CHECK: insertelement <32 x half> {{.*}}, i32 12
-  // CHECK: insertelement <32 x half> {{.*}}, i32 13
-  // CHECK: insertelement <32 x half> {{.*}}, i32 14
-  // CHECK: insertelement <32 x half> {{.*}}, i32 15
-  // CHECK: insertelement <32 x half> {{.*}}, i32 16
-  // CHECK: insertelement <32 x half> {{.*}}, i32 17
-  // CHECK: insertelement <32 x half> {{.*}}, i32 18
-  // CHECK: insertelement <32 x half> {{.*}}, i32 19
-  // CHECK: insertelement <32 x half> {{.*}}, i32 20
-  // CHECK: insertelement <32 x half> {{.*}}, i32 21
-  // CHECK: insertelement <32 x half> {{.*}}, i32 22
-  // CHECK: insertelement <32 x half> {{.*}}, i32 23
-  // CHECK: insertelement <32 x half> {{.*}}, i32 24
-  // CHECK: insertelement <32 x half> {{.*}}, i32 25
-  // CHECK: insertelement <32 x half> {{.*}}, i32 26
-  // CHECK: insertelement <32 x half> {{.*}}, i32 27
-  // CHECK: insertelement <32 x half> {{.*}}, i32 28
-  // CHECK: insertelement <32 x half> {{.*}}, i32 29
-  // CHECK: insertelement <32 x half> {{.*}}, i32 30
-  // CHECK: insertelement <32 x half> {{.*}}, i32 31
   return _mm512_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8,
                        __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16,
                        __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24,
                        __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_setr_ph(
+// CHECK-SAME: half noundef [[__H1:%.*]], half noundef [[__H2:%.*]], half noundef [[__H3:%.*]], half noundef [[__H4:%.*]], half noundef [[__H5:%.*]], half noundef [[__H6:%.*]], half noundef [[__H7:%.*]], half noundef [[__H8:%.*]], half noundef [[__H9:%.*]], half noundef [[__H10:%.*]], half noundef [[__H11:%.*]], half noundef [[__H12:%.*]], half noundef [[__H13:%.*]], half noundef [[__H14:%.*]], half noundef [[__H15:%.*]], half noundef [[__H16:%.*]], half noundef [[__H17:%.*]], half noundef [[__H18:%.*]], half noundef [[__H19:%.*]], half noundef [[__H20:%.*]], half noundef [[__H21:%.*]], half noundef [[__H22:%.*]], half noundef [[__H23:%.*]], half noundef [[__H24:%.*]], half noundef [[__H25:%.*]], half noundef [[__H26:%.*]], half noundef [[__H27:%.*]], half noundef [[__H28:%.*]], half noundef [[__H29:%.*]], half noundef [[__H30:%.*]], half noundef [[__H31:%.*]], half noundef [[__H32:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__H1_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H2_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H3_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H4_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H5_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H6_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H7_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H8_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H9_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H10_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H11_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H12_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H13_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H14_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H15_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H16_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H17_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H18_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H19_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H20_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H21_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H22_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H23_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H24_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H25_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H26_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H27_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H28_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H29_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H30_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H31_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H32_ADDR_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__H1_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H2_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H3_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H4_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H5_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H6_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H7_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H8_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H9_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H10_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H11_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H12_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H13_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H14_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H15_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H16_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H17_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H18_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H19_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H20_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H21_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H22_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H23_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H24_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H25_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H26_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H27_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H28_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H29_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H30_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H31_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[__H32_ADDR:%.*]] = alloca half, align 2
+// CHECK-NEXT:    store half [[__H1]], ptr [[__H1_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H2]], ptr [[__H2_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H3]], ptr [[__H3_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H4]], ptr [[__H4_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H5]], ptr [[__H5_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H6]], ptr [[__H6_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H7]], ptr [[__H7_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H8]], ptr [[__H8_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H9]], ptr [[__H9_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H10]], ptr [[__H10_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H11]], ptr [[__H11_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H12]], ptr [[__H12_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H13]], ptr [[__H13_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H14]], ptr [[__H14_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H15]], ptr [[__H15_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H16]], ptr [[__H16_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H17]], ptr [[__H17_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H18]], ptr [[__H18_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H19]], ptr [[__H19_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H20]], ptr [[__H20_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H21]], ptr [[__H21_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H22]], ptr [[__H22_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H23]], ptr [[__H23_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H24]], ptr [[__H24_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H25]], ptr [[__H25_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H26]], ptr [[__H26_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H27]], ptr [[__H27_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H28]], ptr [[__H28_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H29]], ptr [[__H29_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H30]], ptr [[__H30_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H31]], ptr [[__H31_ADDR]], align 2
+// CHECK-NEXT:    store half [[__H32]], ptr [[__H32_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load half, ptr [[__H32_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load half, ptr [[__H31_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[__H30_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__H29_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, ptr [[__H28_ADDR]], align 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load half, ptr [[__H27_ADDR]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load half, ptr [[__H26_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load half, ptr [[__H25_ADDR]], align 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load half, ptr [[__H24_ADDR]], align 2
+// CHECK-NEXT:    [[TMP9:%.*]] = load half, ptr [[__H23_ADDR]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = load half, ptr [[__H22_ADDR]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = load half, ptr [[__H21_ADDR]], align 2
+// CHECK-NEXT:    [[TMP12:%.*]] = load half, ptr [[__H20_ADDR]], align 2
+// CHECK-NEXT:    [[TMP13:%.*]] = load half, ptr [[__H19_ADDR]], align 2
+// CHECK-NEXT:    [[TMP14:%.*]] = load half, ptr [[__H18_ADDR]], align 2
+// CHECK-NEXT:    [[TMP15:%.*]] = load half, ptr [[__H17_ADDR]], align 2
+// CHECK-NEXT:    [[TMP16:%.*]] = load half, ptr [[__H16_ADDR]], align 2
+// CHECK-NEXT:    [[TMP17:%.*]] = load half, ptr [[__H15_ADDR]], align 2
+// CHECK-NEXT:    [[TMP18:%.*]] = load half, ptr [[__H14_ADDR]], align 2
+// CHECK-NEXT:    [[TMP19:%.*]] = load half, ptr [[__H13_ADDR]], align 2
+// CHECK-NEXT:    [[TMP20:%.*]] = load half, ptr [[__H12_ADDR]], align 2
+// CHECK-NEXT:    [[TMP21:%.*]] = load half, ptr [[__H11_ADDR]], align 2
+// CHECK-NEXT:    [[TMP22:%.*]] = load half, ptr [[__H10_ADDR]], align 2
+// CHECK-NEXT:    [[TMP23:%.*]] = load half, ptr [[__H9_ADDR]], align 2
+// CHECK-NEXT:    [[TMP24:%.*]] = load half, ptr [[__H8_ADDR]], align 2
+// CHECK-NEXT:    [[TMP25:%.*]] = load half, ptr [[__H7_ADDR]], align 2
+// CHECK-NEXT:    [[TMP26:%.*]] = load half, ptr [[__H6_ADDR]], align 2
+// CHECK-NEXT:    [[TMP27:%.*]] = load half, ptr [[__H5_ADDR]], align 2
+// CHECK-NEXT:    [[TMP28:%.*]] = load half, ptr [[__H4_ADDR]], align 2
+// CHECK-NEXT:    [[TMP29:%.*]] = load half, ptr [[__H3_ADDR]], align 2
+// CHECK-NEXT:    [[TMP30:%.*]] = load half, ptr [[__H2_ADDR]], align 2
+// CHECK-NEXT:    [[TMP31:%.*]] = load half, ptr [[__H1_ADDR]], align 2
+// CHECK-NEXT:    store half [[TMP0]], ptr [[__H1_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP1]], ptr [[__H2_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP2]], ptr [[__H3_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP3]], ptr [[__H4_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP4]], ptr [[__H5_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP5]], ptr [[__H6_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP6]], ptr [[__H7_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP7]], ptr [[__H8_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP8]], ptr [[__H9_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP9]], ptr [[__H10_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP10]], ptr [[__H11_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP11]], ptr [[__H12_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP12]], ptr [[__H13_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP13]], ptr [[__H14_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP14]], ptr [[__H15_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP15]], ptr [[__H16_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP16]], ptr [[__H17_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP17]], ptr [[__H18_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP18]], ptr [[__H19_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP19]], ptr [[__H20_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP20]], ptr [[__H21_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP21]], ptr [[__H22_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP22]], ptr [[__H23_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP23]], ptr [[__H24_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP24]], ptr [[__H25_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP25]], ptr [[__H26_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP26]], ptr [[__H27_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP27]], ptr [[__H28_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP28]], ptr [[__H29_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP29]], ptr [[__H30_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP30]], ptr [[__H31_ADDR_I]], align 2
+// CHECK-NEXT:    store half [[TMP31]], ptr [[__H32_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP32:%.*]] = load half, ptr [[__H32_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <32 x half> undef, half [[TMP32]], i32 0
+// CHECK-NEXT:    [[TMP33:%.*]] = load half, ptr [[__H31_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <32 x half> [[VECINIT_I]], half [[TMP33]], i32 1
+// CHECK-NEXT:    [[TMP34:%.*]] = load half, ptr [[__H30_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <32 x half> [[VECINIT1_I]], half [[TMP34]], i32 2
+// CHECK-NEXT:    [[TMP35:%.*]] = load half, ptr [[__H29_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <32 x half> [[VECINIT2_I]], half [[TMP35]], i32 3
+// CHECK-NEXT:    [[TMP36:%.*]] = load half, ptr [[__H28_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <32 x half> [[VECINIT3_I]], half [[TMP36]], i32 4
+// CHECK-NEXT:    [[TMP37:%.*]] = load half, ptr [[__H27_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <32 x half> [[VECINIT4_I]], half [[TMP37]], i32 5
+// CHECK-NEXT:    [[TMP38:%.*]] = load half, ptr [[__H26_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <32 x half> [[VECINIT5_I]], half [[TMP38]], i32 6
+// CHECK-NEXT:    [[TMP39:%.*]] = load half, ptr [[__H25_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <32 x half> [[VECINIT6_I]], half [[TMP39]], i32 7
+// CHECK-NEXT:    [[TMP40:%.*]] = load half, ptr [[__H24_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <32 x half> [[VECINIT7_I]], half [[TMP40]], i32 8
+// CHECK-NEXT:    [[TMP41:%.*]] = load half, ptr [[__H23_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <32 x half> [[VECINIT8_I]], half [[TMP41]], i32 9
+// CHECK-NEXT:    [[TMP42:%.*]] = load half, ptr [[__H22_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <32 x half> [[VECINIT9_I]], half [[TMP42]], i32 10
+// CHECK-NEXT:    [[TMP43:%.*]] = load half, ptr [[__H21_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <32 x half> [[VECINIT10_I]], half [[TMP43]], i32 11
+// CHECK-NEXT:    [[TMP44:%.*]] = load half, ptr [[__H20_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <32 x half> [[VECINIT11_I]], half [[TMP44]], i32 12
+// CHECK-NEXT:    [[TMP45:%.*]] = load half, ptr [[__H19_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <32 x half> [[VECINIT12_I]], half [[TMP45]], i32 13
+// CHECK-NEXT:    [[TMP46:%.*]] = load half, ptr [[__H18_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <32 x half> [[VECINIT13_I]], half [[TMP46]], i32 14
+// CHECK-NEXT:    [[TMP47:%.*]] = load half, ptr [[__H17_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <32 x half> [[VECINIT14_I]], half [[TMP47]], i32 15
+// CHECK-NEXT:    [[TMP48:%.*]] = load half, ptr [[__H16_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT16_I:%.*]] = insertelement <32 x half> [[VECINIT15_I]], half [[TMP48]], i32 16
+// CHECK-NEXT:    [[TMP49:%.*]] = load half, ptr [[__H15_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT17_I:%.*]] = insertelement <32 x half> [[VECINIT16_I]], half [[TMP49]], i32 17
+// CHECK-NEXT:    [[TMP50:%.*]] = load half, ptr [[__H14_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT18_I:%.*]] = insertelement <32 x half> [[VECINIT17_I]], half [[TMP50]], i32 18
+// CHECK-NEXT:    [[TMP51:%.*]] = load half, ptr [[__H13_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT19_I:%.*]] = insertelement <32 x half> [[VECINIT18_I]], half [[TMP51]], i32 19
+// CHECK-NEXT:    [[TMP52:%.*]] = load half, ptr [[__H12_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT20_I:%.*]] = insertelement <32 x half> [[VECINIT19_I]], half [[TMP52]], i32 20
+// CHECK-NEXT:    [[TMP53:%.*]] = load half, ptr [[__H11_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT21_I:%.*]] = insertelement <32 x half> [[VECINIT20_I]], half [[TMP53]], i32 21
+// CHECK-NEXT:    [[TMP54:%.*]] = load half, ptr [[__H10_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT22_I:%.*]] = insertelement <32 x half> [[VECINIT21_I]], half [[TMP54]], i32 22
+// CHECK-NEXT:    [[TMP55:%.*]] = load half, ptr [[__H9_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT23_I:%.*]] = insertelement <32 x half> [[VECINIT22_I]], half [[TMP55]], i32 23
+// CHECK-NEXT:    [[TMP56:%.*]] = load half, ptr [[__H8_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT24_I:%.*]] = insertelement <32 x half> [[VECINIT23_I]], half [[TMP56]], i32 24
+// CHECK-NEXT:    [[TMP57:%.*]] = load half, ptr [[__H7_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT25_I:%.*]] = insertelement <32 x half> [[VECINIT24_I]], half [[TMP57]], i32 25
+// CHECK-NEXT:    [[TMP58:%.*]] = load half, ptr [[__H6_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT26_I:%.*]] = insertelement <32 x half> [[VECINIT25_I]], half [[TMP58]], i32 26
+// CHECK-NEXT:    [[TMP59:%.*]] = load half, ptr [[__H5_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT27_I:%.*]] = insertelement <32 x half> [[VECINIT26_I]], half [[TMP59]], i32 27
+// CHECK-NEXT:    [[TMP60:%.*]] = load half, ptr [[__H4_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT28_I:%.*]] = insertelement <32 x half> [[VECINIT27_I]], half [[TMP60]], i32 28
+// CHECK-NEXT:    [[TMP61:%.*]] = load half, ptr [[__H3_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT29_I:%.*]] = insertelement <32 x half> [[VECINIT28_I]], half [[TMP61]], i32 29
+// CHECK-NEXT:    [[TMP62:%.*]] = load half, ptr [[__H2_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT30_I:%.*]] = insertelement <32 x half> [[VECINIT29_I]], half [[TMP62]], i32 30
+// CHECK-NEXT:    [[TMP63:%.*]] = load half, ptr [[__H1_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT31_I:%.*]] = insertelement <32 x half> [[VECINIT30_I]], half [[TMP63]], i32 31
+// CHECK-NEXT:    store <32 x half> [[VECINIT31_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP64:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    ret <32 x half> [[TMP64]]
+//
 __m512h test_mm512_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
                            _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
                            _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
@@ -158,4477 +711,15844 @@ __m512h test_mm512_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16
                            _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
                            _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
                            _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
-  // CHECK-LABEL: @test_mm512_setr_ph
-  // CHECK: insertelement <32 x half> {{.*}}, i32 0
-  // CHECK: insertelement <32 x half> {{.*}}, i32 1
-  // CHECK: insertelement <32 x half> {{.*}}, i32 2
-  // CHECK: insertelement <32 x half> {{.*}}, i32 3
-  // CHECK: insertelement <32 x half> {{.*}}, i32 4
-  // CHECK: insertelement <32 x half> {{.*}}, i32 5
-  // CHECK: insertelement <32 x half> {{.*}}, i32 6
-  // CHECK: insertelement <32 x half> {{.*}}, i32 7
-  // CHECK: insertelement <32 x half> {{.*}}, i32 8
-  // CHECK: insertelement <32 x half> {{.*}}, i32 9
-  // CHECK: insertelement <32 x half> {{.*}}, i32 10
-  // CHECK: insertelement <32 x half> {{.*}}, i32 11
-  // CHECK: insertelement <32 x half> {{.*}}, i32 12
-  // CHECK: insertelement <32 x half> {{.*}}, i32 13
-  // CHECK: insertelement <32 x half> {{.*}}, i32 14
-  // CHECK: insertelement <32 x half> {{.*}}, i32 15
-  // CHECK: insertelement <32 x half> {{.*}}, i32 16
-  // CHECK: insertelement <32 x half> {{.*}}, i32 17
-  // CHECK: insertelement <32 x half> {{.*}}, i32 18
-  // CHECK: insertelement <32 x half> {{.*}}, i32 19
-  // CHECK: insertelement <32 x half> {{.*}}, i32 20
-  // CHECK: insertelement <32 x half> {{.*}}, i32 21
-  // CHECK: insertelement <32 x half> {{.*}}, i32 22
-  // CHECK: insertelement <32 x half> {{.*}}, i32 23
-  // CHECK: insertelement <32 x half> {{.*}}, i32 24
-  // CHECK: insertelement <32 x half> {{.*}}, i32 25
-  // CHECK: insertelement <32 x half> {{.*}}, i32 26
-  // CHECK: insertelement <32 x half> {{.*}}, i32 27
-  // CHECK: insertelement <32 x half> {{.*}}, i32 28
-  // CHECK: insertelement <32 x half> {{.*}}, i32 29
-  // CHECK: insertelement <32 x half> {{.*}}, i32 30
-  // CHECK: insertelement <32 x half> {{.*}}, i32 31
   return _mm512_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8,
                         __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16,
                         __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24,
                         __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_castph_ps(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP1]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
 __m128 test_mm_castph_ps(__m128h A) {
-  // CHECK-LABEL: test_mm_castph_ps
-  // CHECK: bitcast <8 x half> %{{.*}} to <4 x float>
   return _mm_castph_ps(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x float> @test_mm256_castph_ps(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x half> [[TMP1]] to <8 x float>
+// CHECK-NEXT:    ret <8 x float> [[TMP2]]
+//
 __m256 test_mm256_castph_ps(__m256h A) {
-  // CHECK-LABEL: test_mm256_castph_ps
-  // CHECK: bitcast <16 x half> %{{.*}} to <8 x float>
   return _mm256_castph_ps(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_castph_ps(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x half> [[TMP1]] to <16 x float>
+// CHECK-NEXT:    ret <16 x float> [[TMP2]]
+//
 __m512 test_mm512_castph_ps(__m512h A) {
-  // CHECK-LABEL: test_mm512_castph_ps
-  // CHECK: bitcast <32 x half> %{{.*}} to <16 x float>
   return _mm512_castph_ps(A);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_castph_pd(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP1]] to <2 x double>
+// CHECK-NEXT:    ret <2 x double> [[TMP2]]
+//
 __m128d test_mm_castph_pd(__m128h A) {
-  // CHECK-LABEL: test_mm_castph_pd
-  // CHECK: bitcast <8 x half> %{{.*}} to <2 x double>
   return _mm_castph_pd(A);
 }
 
+// CHECK-LABEL: define dso_local <4 x double> @test_mm256_castph_pd(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x half> [[TMP1]] to <4 x double>
+// CHECK-NEXT:    ret <4 x double> [[TMP2]]
+//
 __m256d test_mm256_castph_pd(__m256h A) {
-  // CHECK-LABEL: test_mm256_castph_pd
-  // CHECK: bitcast <16 x half> %{{.*}} to <4 x double>
   return _mm256_castph_pd(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_castph_pd(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x half> [[TMP1]] to <8 x double>
+// CHECK-NEXT:    ret <8 x double> [[TMP2]]
+//
 __m512d test_mm512_castph_pd(__m512h A) {
-  // CHECK-LABEL: test_mm512_castph_pd
-  // CHECK: bitcast <32 x half> %{{.*}} to <8 x double>
   return _mm512_castph_pd(A);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_castph_si128(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_castph_si128(__m128h A) {
-  // CHECK-LABEL: test_mm_castph_si128
-  // CHECK: bitcast <8 x half> %{{.*}} to <2 x i64>
   return _mm_castph_si128(A);
 }
 
+// CHECK-LABEL: define dso_local <4 x i64> @test_mm256_castph_si256(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x half> [[TMP1]] to <4 x i64>
+// CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+//
 __m256i test_mm256_castph_si256(__m256h A) {
-  // CHECK-LABEL: test_mm256_castph_si256
-  // CHECK: bitcast <16 x half> %{{.*}} to <4 x i64>
   return _mm256_castph_si256(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_castph_si512(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x half> [[TMP1]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP2]]
+//
 __m512i test_mm512_castph_si512(__m512h A) {
-  // CHECK-LABEL: test_mm512_castph_si512
-  // CHECK: bitcast <32 x half> %{{.*}} to <8 x i64>
   return _mm512_castph_si512(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_castps_ph(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_castps_ph(__m128 A) {
-  // CHECK-LABEL: test_mm_castps_ph
-  // CHECK: bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_castps_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_castps_ph(
+// CHECK-SAME: <8 x float> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x float>, align 32
+// CHECK-NEXT:    store <8 x float> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x float>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <8 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x float> [[TMP1]] to <16 x half>
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm256_castps_ph(__m256 A) {
-  // CHECK-LABEL: test_mm256_castps_ph
-  // CHECK: bitcast <8 x float> %{{.*}} to <16 x half>
   return _mm256_castps_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castps_ph(
+// CHECK-SAME: <16 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store <16 x float> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x float> [[TMP1]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_castps_ph(__m512 A) {
-  // CHECK-LABEL: test_mm512_castps_ph
-  // CHECK: bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_castps_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_castpd_ph(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_castpd_ph(__m128d A) {
-  // CHECK-LABEL: test_mm_castpd_ph
-  // CHECK: bitcast <2 x double> %{{.*}} to <8 x half>
   return _mm_castpd_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_castpd_ph(
+// CHECK-SAME: <4 x double> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x double>, align 32
+// CHECK-NEXT:    store <4 x double> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <4 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x double> [[TMP1]] to <16 x half>
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm256_castpd_ph(__m256d A) {
-  // CHECK-LABEL: test_mm256_castpd_ph
-  // CHECK: bitcast <4 x double> %{{.*}} to <16 x half>
   return _mm256_castpd_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castpd_ph(
+// CHECK-SAME: <8 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store <8 x double> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x double> [[TMP1]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_castpd_ph(__m512d A) {
-  // CHECK-LABEL: test_mm512_castpd_ph
-  // CHECK: bitcast <8 x double> %{{.*}} to <32 x half>
   return _mm512_castpd_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_castsi128_ph(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_castsi128_ph(__m128i A) {
-  // CHECK-LABEL: test_mm_castsi128_ph
-  // CHECK: bitcast <2 x i64> %{{.*}} to <8 x half>
   return _mm_castsi128_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_castsi256_ph(
+// CHECK-SAME: <4 x i64> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x i64>, align 32
+// CHECK-NEXT:    store <4 x i64> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <4 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x half>
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm256_castsi256_ph(__m256i A) {
-  // CHECK-LABEL: test_mm256_castsi256_ph
-  // CHECK: bitcast <4 x i64> %{{.*}} to <16 x half>
   return _mm256_castsi256_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castsi512_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_castsi512_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_castsi512_ph
-  // CHECK: bitcast <8 x i64> %{{.*}} to <32 x half>
   return _mm512_castsi512_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm256_castph256_ph128(
+// CHECK-SAME: <16 x half> noundef [[__A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[__A]], ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <16 x half> [[TMP1]], <16 x half> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 __m128h test_mm256_castph256_ph128(__m256h __a) {
-  // CHECK-LABEL: test_mm256_castph256_ph128
-  // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm256_castph256_ph128(__a);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_castph512_ph128(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <32 x half> [[TMP1]], <32 x half> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 __m128h test_mm512_castph512_ph128(__m512h __a) {
-  // CHECK-LABEL: test_mm512_castph512_ph128
-  // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return _mm512_castph512_ph128(__a);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_castph512_ph256(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <32 x half> [[TMP1]], <32 x half> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    ret <16 x half> [[SHUFFLE_I]]
+//
 __m256h test_mm512_castph512_ph256(__m512h __a) {
-  // CHECK-LABEL: test_mm512_castph512_ph256
-  // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_castph512_ph256(__a);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_castph128_ph256(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+// CHECK-NEXT:    ret <16 x half> [[SHUFFLE_I]]
+//
 __m256h test_mm256_castph128_ph256(__m128h __a) {
-  // CHECK-LABEL: test_mm256_castph128_ph256
-  // CHECK: [[A:%.*]] = freeze <8 x half> poison 
-  // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm256_castph128_ph256(__a);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castph128_ph512(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+// CHECK-NEXT:    ret <32 x half> [[SHUFFLE_I]]
+//
 __m512h test_mm512_castph128_ph512(__m128h __a) {
-  // CHECK-LABEL: test_mm512_castph128_ph512
-  // CHECK: [[B:%.*]] = freeze <16 x half> poison
-  // CHECK: store <16 x half> [[B]], ptr [[BA:%.*]]
-  // CHECK: [[A:%.*]] = freeze <8 x half> poison
-  // CHECK: [[SV:%.*]] = shufflevector <8 x half> %{{.*}}, <8 x half> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  // CHECK: [[C:%.*]] = load <16 x half>, ptr [[BA]]
-  // CHECK: shufflevector <16 x half> [[SV]], <16 x half> [[C]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   return _mm512_castph128_ph512(__a);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_castph256_ph512(
+// CHECK-SAME: <16 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[__A]], ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <16 x half> [[TMP1]], <16 x half> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+// CHECK-NEXT:    ret <32 x half> [[SHUFFLE_I]]
+//
 __m512h test_mm512_castph256_ph512(__m256h __a) {
-  // CHECK-LABEL: test_mm512_castph256_ph512
-  // CHECK: [[A:%.*]] = freeze <16 x half> poison 
-  // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> [[A]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   return _mm512_castph256_ph512(__a);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_zextph128_ph256(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    ret <16 x half> [[SHUFFLE_I]]
+//
 __m256h test_mm256_zextph128_ph256(__m128h __a) {
-  // CHECK-LABEL: test_mm256_zextph128_ph256
-  // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm256_zextph128_ph256(__a);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_zextph128_ph512(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    ret <32 x half> [[SHUFFLE_I]]
+//
 __m512h test_mm512_zextph128_ph512(__m128h __a) {
-  // CHECK-LABEL: test_mm512_zextph128_ph512
-  // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return _mm512_zextph128_ph512(__a);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_zextph256_ph512(
+// CHECK-SAME: <16 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[__A]], ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[__A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <16 x half> [[TMP1]], <16 x half> [[TMP2]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+// CHECK-NEXT:    ret <32 x half> [[SHUFFLE_I]]
+//
 __m512h test_mm512_zextph256_ph512(__m256h __a) {
-  // CHECK-LABEL: test_mm512_zextph256_ph512
-  // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> {{.*}}, <32 x i32>
   return _mm512_zextph256_ph512(__a);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comi_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 0, i32 8)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_comi_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comi_round_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 8)
   return _mm_comi_round_sh(__A, __B, 0, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comi_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 0, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_comi_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comi_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4)
   return _mm_comi_sh(__A, __B, 0);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comieq_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 16, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comieq_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comieq_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 16, i32 4)
   return _mm_comieq_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comilt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 1, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comilt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comilt_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 1, i32 4)
   return _mm_comilt_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comile_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 2, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comile_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comile_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 2, i32 4)
   return _mm_comile_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comigt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 14, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comigt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comigt_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 14, i32 4)
   return _mm_comigt_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comige_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 13, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comige_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comige_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 13, i32 4)
   return _mm_comige_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_comineq_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 20, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comineq_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_comineq_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 20, i32 4)
   return _mm_comineq_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomieq_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 0, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomieq_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomieq_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 0, i32 4)
   return _mm_ucomieq_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomilt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 17, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomilt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomilt_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 17, i32 4)
   return _mm_ucomilt_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomile_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 18, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomile_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomile_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 18, i32 4)
   return _mm_ucomile_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomigt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 30, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomigt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomigt_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 30, i32 4)
   return _mm_ucomigt_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomige_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 29, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomige_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomige_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 29, i32 4)
   return _mm_ucomige_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_ucomineq_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], i32 4, i32 4)
+// CHECK-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomineq_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: test_mm_ucomineq_sh
-  // CHECK: %{{.}} = call i32 @llvm.x86.avx512fp16.vcomi.sh(<8 x half> %{{.}}, <8 x half> %{{.}}, i32 4, i32 4)
   return _mm_ucomineq_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_add_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd <32 x half> [[TMP2]], [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[ADD_I]]
+//
 __m512h test_mm512_add_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_add_ph
-  // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}}
   return _mm512_add_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_add_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = fadd <32 x half> [[TMP7]], [[TMP8]]
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[ADD_I_I]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_ph
-  // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_add_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_add_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[ADD_I_I:%.*]] = fadd <32 x half> [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[ADD_I_I]], <32 x half> [[TMP8]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_ph
-  // CHECK: %{{.*}} = fadd <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_add_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_add_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_add_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_add_round_ph
-  // CHECK: @llvm.x86.avx512fp16.add.ph.512
   return _mm512_add_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_add_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_add_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_add_round_ph
-  // CHECK: @llvm.x86.avx512fp16.add.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_add_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_add_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.add.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_add_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_add_round_ph
-  // CHECK: @llvm.x86.avx512fp16.add.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_add_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_sub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub <32 x half> [[TMP2]], [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[SUB_I]]
+//
 __m512h test_mm512_sub_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_sub_ph
-  // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}}
   return _mm512_sub_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_sub_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = fsub <32 x half> [[TMP7]], [[TMP8]]
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[SUB_I_I]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_ph
-  // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_sub_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_sub_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[SUB_I_I:%.*]] = fsub <32 x half> [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[SUB_I_I]], <32 x half> [[TMP8]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_ph
-  // CHECK: %{{.*}} = fsub <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_sub_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_sub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_sub_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_sub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.sub.ph.512
   return _mm512_sub_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_sub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_sub_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_sub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.sub.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_sub_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_sub_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sub.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_sub_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_sub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.sub.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_sub_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mul_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <32 x half> [[TMP2]], [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[MUL_I]]
+//
 __m512h test_mm512_mul_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mul_ph
-  // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}}
   return _mm512_mul_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_mul_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[MUL_I_I:%.*]] = fmul <32 x half> [[TMP7]], [[TMP8]]
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[MUL_I_I]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_ph
-  // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_mul_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_mul_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[MUL_I_I:%.*]] = fmul <32 x half> [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[MUL_I_I]], <32 x half> [[TMP8]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_ph
-  // CHECK: %{{.*}} = fmul <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_mul_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mul_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_mul_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mul_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mul.ph.512
   return _mm512_mul_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_mul_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_mul_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mul.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_mul_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_mul_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mul.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_mul_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mul.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_mul_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_div_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv <32 x half> [[TMP2]], [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[DIV_I]]
+//
 __m512h test_mm512_div_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_div_ph
-  // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}}
   return _mm512_div_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_div_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[DIV_I_I:%.*]] = fdiv <32 x half> [[TMP7]], [[TMP8]]
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[DIV_I_I]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_ph
-  // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_div_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_div_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[DIV_I_I:%.*]] = fdiv <32 x half> [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[DIV_I_I]], <32 x half> [[TMP8]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_ph
-  // CHECK: %{{.*}} = fdiv <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_div_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_div_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_div_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_div_round_ph
-  // CHECK: @llvm.x86.avx512fp16.div.ph.512
   return _mm512_div_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_div_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_div_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_round_ph
-  // CHECK: @llvm.x86.avx512fp16.div.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_div_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_div_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.div.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_div_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_div_round_ph
-  // CHECK: @llvm.x86.avx512fp16.div.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_div_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_min_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_min_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_min_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
   return _mm512_min_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_min_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP7]], <32 x half> [[TMP8]], i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x half> [[TMP9]], <32 x half> [[TMP10]]
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_min_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_min_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP6]], <32 x half> [[TMP7]], i32 4)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[TMP8]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_min_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_min_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_min_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_min_round_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
   return _mm512_min_round_ph(__A, __B, _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_min_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 8)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_min_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_min_round_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_min_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_min_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.min.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 8)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_min_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_min_round_ph
-  // CHECK: @llvm.x86.avx512fp16.min.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_min_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_max_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_max_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_max_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
 
   return _mm512_max_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_max_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP7]], <32 x half> [[TMP8]], i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32 [[TMP4]] to <32 x i1>
+// CHECK-NEXT:    [[TMP12:%.*]] = select <32 x i1> [[TMP11]], <32 x half> [[TMP9]], <32 x half> [[TMP10]]
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return (__m512h)_mm512_mask_max_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_max_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP6]], <32 x half> [[TMP7]], i32 4)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[TMP8]], <32 x half> [[TMP9]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_max_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_max_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_max_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_max_round_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
   return _mm512_max_round_ph(__A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_max_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 8)
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_max_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_max_round_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_max_round_ph(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_max_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.max.ph.512(<32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 8)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP3]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_max_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_max_round_ph
-  // CHECK: @llvm.x86.avx512fp16.max.ph.512
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_maskz_max_round_ph(__U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_abs_ph(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__S_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x i32>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 2147450879, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[TMP1]], i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x i32> [[VECINIT_I]], i32 [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x i32> [[VECINIT1_I]], i32 [[TMP3]], i32 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x i32> [[VECINIT2_I]], i32 [[TMP4]], i32 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x i32> [[VECINIT3_I]], i32 [[TMP5]], i32 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x i32> [[VECINIT4_I]], i32 [[TMP6]], i32 5
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x i32> [[VECINIT5_I]], i32 [[TMP7]], i32 6
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x i32> [[VECINIT6_I]], i32 [[TMP8]], i32 7
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x i32> [[VECINIT7_I]], i32 [[TMP9]], i32 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x i32> [[VECINIT8_I]], i32 [[TMP10]], i32 9
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x i32> [[VECINIT9_I]], i32 [[TMP11]], i32 10
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x i32> [[VECINIT10_I]], i32 [[TMP12]], i32 11
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x i32> [[VECINIT11_I]], i32 [[TMP13]], i32 12
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x i32> [[VECINIT12_I]], i32 [[TMP14]], i32 13
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x i32> [[VECINIT13_I]], i32 [[TMP15]], i32 14
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[__S_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x i32> [[VECINIT14_I]], i32 [[TMP16]], i32 15
+// CHECK-NEXT:    store <16 x i32> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP17:%.*]] = load <16 x i32>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast <16 x i32> [[TMP17]] to <8 x i64>
+// CHECK-NEXT:    [[TMP19:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP20:%.*]] = bitcast <32 x half> [[TMP19]] to <8 x i64>
+// CHECK-NEXT:    store <8 x i64> [[TMP18]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP20]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i64> [[TMP21]] to <16 x i32>
+// CHECK-NEXT:    [[TMP23:%.*]] = load <8 x i64>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <8 x i64> [[TMP23]] to <16 x i32>
+// CHECK-NEXT:    [[AND_I:%.*]] = and <16 x i32> [[TMP22]], [[TMP24]]
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x i32> [[AND_I]] to <8 x i64>
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <8 x i64> [[TMP25]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP26]]
+//
 __m512h test_mm512_abs_ph(__m512h a) {
-  // CHECK-LABEL: @test_mm512_abs_ph
-  // CHECK: and <16 x i32>
   return _mm512_abs_ph(a);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_conj_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x half> [[TMP1]] to <16 x float>
+// CHECK-NEXT:    store float -0.000000e+00, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x float> undef, float [[TMP3]], i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x float> [[VECINIT_I]], float [[TMP4]], i32 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x float> [[VECINIT1_I]], float [[TMP5]], i32 2
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x float> [[VECINIT2_I]], float [[TMP6]], i32 3
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x float> [[VECINIT3_I]], float [[TMP7]], i32 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x float> [[VECINIT4_I]], float [[TMP8]], i32 5
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x float> [[VECINIT5_I]], float [[TMP9]], i32 6
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x float> [[VECINIT6_I]], float [[TMP10]], i32 7
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x float> [[VECINIT7_I]], float [[TMP11]], i32 8
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x float> [[VECINIT8_I]], float [[TMP12]], i32 9
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x float> [[VECINIT9_I]], float [[TMP13]], i32 10
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x float> [[VECINIT10_I]], float [[TMP14]], i32 11
+// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x float> [[VECINIT11_I]], float [[TMP15]], i32 12
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x float> [[VECINIT12_I]], float [[TMP16]], i32 13
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x float> [[VECINIT13_I]], float [[TMP17]], i32 14
+// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x float> [[VECINIT14_I]], float [[TMP18]], i32 15
+// CHECK-NEXT:    store <16 x float> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP19:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP2]], ptr [[__A_ADDR_I1]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP19]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP20:%.*]] = load <16 x float>, ptr [[__A_ADDR_I1]], align 64
+// CHECK-NEXT:    [[TMP21:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32>
+// CHECK-NEXT:    [[TMP22:%.*]] = load <16 x float>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP23:%.*]] = bitcast <16 x float> [[TMP22]] to <16 x i32>
+// CHECK-NEXT:    [[XOR_I:%.*]] = xor <16 x i32> [[TMP21]], [[TMP23]]
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x i32> [[XOR_I]] to <16 x float>
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x float> [[TMP24]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP25]]
+//
 __m512h test_mm512_conj_pch(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_conj_pch
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK:  %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_conj_pch(__A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_conj_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP1]] to i16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[CONV]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    store float -0.000000e+00, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x float> undef, float [[TMP7]], i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x float> [[VECINIT_I]], float [[TMP8]], i32 1
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x float> [[VECINIT1_I]], float [[TMP9]], i32 2
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x float> [[VECINIT2_I]], float [[TMP10]], i32 3
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x float> [[VECINIT3_I]], float [[TMP11]], i32 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x float> [[VECINIT4_I]], float [[TMP12]], i32 5
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x float> [[VECINIT5_I]], float [[TMP13]], i32 6
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x float> [[VECINIT6_I]], float [[TMP14]], i32 7
+// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x float> [[VECINIT7_I]], float [[TMP15]], i32 8
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x float> [[VECINIT8_I]], float [[TMP16]], i32 9
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x float> [[VECINIT9_I]], float [[TMP17]], i32 10
+// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x float> [[VECINIT10_I]], float [[TMP18]], i32 11
+// CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x float> [[VECINIT11_I]], float [[TMP19]], i32 12
+// CHECK-NEXT:    [[TMP20:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x float> [[VECINIT12_I]], float [[TMP20]], i32 13
+// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x float> [[VECINIT13_I]], float [[TMP21]], i32 14
+// CHECK-NEXT:    [[TMP22:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x float> [[VECINIT14_I]], float [[TMP22]], i32 15
+// CHECK-NEXT:    store <16 x float> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP23:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP6]], ptr [[__A_ADDR_I1]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP23]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP24:%.*]] = load <16 x float>, ptr [[__A_ADDR_I1]], align 64
+// CHECK-NEXT:    [[TMP25:%.*]] = bitcast <16 x float> [[TMP24]] to <16 x i32>
+// CHECK-NEXT:    [[TMP26:%.*]] = load <16 x float>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x float> [[TMP26]] to <16 x i32>
+// CHECK-NEXT:    [[XOR_I:%.*]] = xor <16 x i32> [[TMP25]], [[TMP27]]
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <16 x i32> [[XOR_I]] to <16 x float>
+// CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x float> [[TMP28]] to <32 x half>
+// CHECK-NEXT:    [[TMP30:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP31:%.*]] = bitcast <32 x half> [[TMP30]] to <16 x float>
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP33:%.*]] = select <16 x i1> [[TMP32]], <16 x float> [[TMP28]], <16 x float> [[TMP31]]
+// CHECK-NEXT:    [[TMP34:%.*]] = bitcast <16 x float> [[TMP33]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP34]]
+//
 __m512h test_mm512_mask_conj_pch(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_conj_pch
-  // CHECK:  %{{.*}} = trunc i32 %{{.*}} to i16
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK:  %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_mask_conj_pch(__W, __U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_conj_pch(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I2:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I1:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP0]] to i16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[CONV]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    store float -0.000000e+00, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <16 x float> undef, float [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <16 x float> [[VECINIT_I]], float [[TMP7]], i32 1
+// CHECK-NEXT:    [[TMP8:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <16 x float> [[VECINIT1_I]], float [[TMP8]], i32 2
+// CHECK-NEXT:    [[TMP9:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <16 x float> [[VECINIT2_I]], float [[TMP9]], i32 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <16 x float> [[VECINIT3_I]], float [[TMP10]], i32 4
+// CHECK-NEXT:    [[TMP11:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <16 x float> [[VECINIT4_I]], float [[TMP11]], i32 5
+// CHECK-NEXT:    [[TMP12:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <16 x float> [[VECINIT5_I]], float [[TMP12]], i32 6
+// CHECK-NEXT:    [[TMP13:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <16 x float> [[VECINIT6_I]], float [[TMP13]], i32 7
+// CHECK-NEXT:    [[TMP14:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <16 x float> [[VECINIT7_I]], float [[TMP14]], i32 8
+// CHECK-NEXT:    [[TMP15:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT9_I:%.*]] = insertelement <16 x float> [[VECINIT8_I]], float [[TMP15]], i32 9
+// CHECK-NEXT:    [[TMP16:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT10_I:%.*]] = insertelement <16 x float> [[VECINIT9_I]], float [[TMP16]], i32 10
+// CHECK-NEXT:    [[TMP17:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT11_I:%.*]] = insertelement <16 x float> [[VECINIT10_I]], float [[TMP17]], i32 11
+// CHECK-NEXT:    [[TMP18:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT12_I:%.*]] = insertelement <16 x float> [[VECINIT11_I]], float [[TMP18]], i32 12
+// CHECK-NEXT:    [[TMP19:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT13_I:%.*]] = insertelement <16 x float> [[VECINIT12_I]], float [[TMP19]], i32 13
+// CHECK-NEXT:    [[TMP20:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT14_I:%.*]] = insertelement <16 x float> [[VECINIT13_I]], float [[TMP20]], i32 14
+// CHECK-NEXT:    [[TMP21:%.*]] = load float, ptr [[__W_ADDR_I]], align 4
+// CHECK-NEXT:    [[VECINIT15_I:%.*]] = insertelement <16 x float> [[VECINIT14_I]], float [[TMP21]], i32 15
+// CHECK-NEXT:    store <16 x float> [[VECINIT15_I]], ptr [[DOTCOMPOUNDLITERAL_I1]], align 64
+// CHECK-NEXT:    [[TMP22:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I1]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP5]], ptr [[__A_ADDR_I2]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP22]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP23:%.*]] = load <16 x float>, ptr [[__A_ADDR_I2]], align 64
+// CHECK-NEXT:    [[TMP24:%.*]] = bitcast <16 x float> [[TMP23]] to <16 x i32>
+// CHECK-NEXT:    [[TMP25:%.*]] = load <16 x float>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP26:%.*]] = bitcast <16 x float> [[TMP25]] to <16 x i32>
+// CHECK-NEXT:    [[XOR_I:%.*]] = xor <16 x i32> [[TMP24]], [[TMP26]]
+// CHECK-NEXT:    [[TMP27:%.*]] = bitcast <16 x i32> [[XOR_I]] to <16 x float>
+// CHECK-NEXT:    [[TMP28:%.*]] = bitcast <16 x float> [[TMP27]] to <32 x half>
+// CHECK-NEXT:    store <16 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP29:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP30:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+// CHECK-NEXT:    [[TMP31:%.*]] = select <16 x i1> [[TMP30]], <16 x float> [[TMP27]], <16 x float> [[TMP29]]
+// CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x float> [[TMP31]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP32]]
+//
 __m512h test_mm512_maskz_conj_pch(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_conj_pch
-  // CHECK:  %{{.*}} = trunc i32 %{{.*}} to i16
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <16 x i32>
-  // CHECK:  %{{.*}} = xor <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK:  %{{.*}} = bitcast <16 x i32> %{{.*}} to <16 x float>
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
-  // CHECK:  %{{.*}} = bitcast i16 %{{.*}} to <16 x i1>
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
-  // CHECK:  %{{.*}} = bitcast <16 x float> %{{.*}} to <32 x half>
   return _mm512_maskz_conj_pch(__U, __A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_add_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_add_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_add_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round
   return _mm_add_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_add_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_add_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_add_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round
   return _mm_mask_add_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_add_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.add.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_add_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.add.sh.round
   return _mm_maskz_add_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_add_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP7]], i32 0
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP8]], half [[ADD_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP9]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x half> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+// CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], half [[TMP13]], half [[TMP14]]
+// CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP17]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP18]]
+//
 __m128h test_mm_mask_add_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_add_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_mask_add_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_add_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP7]], half [[ADD_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP8]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x half> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP9]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP10]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_maskz_add_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_add_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_maskz_add_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_add_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP3]], i32 0
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP4]], half [[ADD_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_add_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_add_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fadd half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
   return _mm_add_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_sub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_sub_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_sub_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round
   return _mm_sub_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_sub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_sub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round
   return _mm_mask_sub_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_sub_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sub.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_sub_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.sub.sh.round
   return _mm_maskz_sub_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_sub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP7]], i32 0
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP8]], half [[SUB_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP9]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x half> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+// CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], half [[TMP13]], half [[TMP14]]
+// CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP17]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP18]]
+//
 __m128h test_mm_mask_sub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_sub_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_mask_sub_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_sub_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP7]], half [[SUB_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP8]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x half> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP9]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP10]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_maskz_sub_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_sub_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_maskz_sub_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_sub_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP3]], i32 0
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP4]], half [[SUB_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_sub_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_sub_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fsub half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
   return _mm_sub_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mul_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_mul_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mul_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round
   return _mm_mul_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_mul_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_mul_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round
   return _mm_mask_mul_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_mul_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_mul_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.mul.sh.round
   return _mm_maskz_mul_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_mul_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP7]], i32 0
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP8]], half [[MUL_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP9]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x half> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+// CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], half [[TMP13]], half [[TMP14]]
+// CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP17]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP18]]
+//
 __m128h test_mm_mask_mul_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_mask_mul_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_mul_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP7]], half [[MUL_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP8]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x half> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP9]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP10]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_maskz_mul_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_maskz_mul_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mul_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP3]], i32 0
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP4]], half [[MUL_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_mul_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mul_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fmul half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
   return _mm_mul_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_div_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_div_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_div_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round
   return _mm_div_round_sh(__A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_div_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_div_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_div_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round
   return _mm_mask_div_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_div_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.div.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_div_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.div.sh.round
   return _mm_maskz_div_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_div_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP7]], i32 0
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP8]], half [[DIV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP9]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x half> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+// CHECK-NEXT:    [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
+// CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], half [[TMP13]], half [[TMP14]]
+// CHECK-NEXT:    [[TMP18:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP17]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP18]]
+//
 __m128h test_mm_mask_div_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_div_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_mask_div_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_div_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I1:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I2:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I2]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP7]], half [[DIV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I1]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP8]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x half> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP9]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP10]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_maskz_div_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_div_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
   return _mm_maskz_div_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_div_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT1_I:%.*]] = extractelement <8 x half> [[TMP3]], i32 0
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv half [[VECEXT1_I]], [[VECEXT_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP4]], half [[DIV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_div_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_div_sh
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: %{{.*}} = fdiv half %{{.*}}, %{{.*}}
-  // CHECK: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
   return _mm_div_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_min_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_min_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_min_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_min_round_sh(__A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_min_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_min_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_min_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_mask_min_round_sh(__W, __U, __A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_min_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_min_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_maskz_min_round_sh(__U, __A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_min_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_min_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_min_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_mask_min_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_min_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_min_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_min_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_maskz_min_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_min_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.min.sh.round(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_min_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_min_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.min.sh.round
   return _mm_min_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_max_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_max_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_max_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_max_round_sh(__A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_max_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_max_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_max_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_mask_max_round_sh(__W, __U, __A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_max_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_max_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_maskz_max_round_sh(__U, __A, __B, 0x08);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_max_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_max_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_max_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_mask_max_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_max_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_max_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_max_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_maskz_max_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_max_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.max.sh.round(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_max_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_max_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.max.sh.round
   return _mm_max_sh(__A, __B);
 }
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_round_ph_mask(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_round_ph_mask(__m512h a, __m512h b) {
-  // CHECK-LABEL: @test_mm512_cmp_round_ph_mask
-  // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_round_ph_mask(a, b, 0, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_round_ph_mask(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_round_ph_mask(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_round_ph_mask
-  // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_round_ph_mask(m, a, b, 0, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_eq_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_eq_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: @test_mm512_cmp_ph_mask_eq_oq
-  // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_lt_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_lt_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_lt_os
-  // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_LT_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_le_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_le_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_le_os
-  // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_LE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_unord_q(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_unord_q(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_q
-  // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_UNORD_Q);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_neq_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_neq_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_uq
-  // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nlt_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp uge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nlt_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_us
-  // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NLT_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nle_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ugt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nle_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_us
-  // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NLE_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ord_q(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ord <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ord_q(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_q
-  // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_ORD_Q);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_eq_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ueq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_eq_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_uq
-  // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_EQ_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nge_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nge_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_us
-  // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NGE_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ngt_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ngt_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_us
-  // CHECK: fcmp ule <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NGT_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_false_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp false <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_false_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_false_oq
-  // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_neq_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp one <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_neq_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_oq
-  // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ge_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ge_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_os
-  // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_gt_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_gt_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_os
-  // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_GT_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_true_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp true <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_true_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_true_uq
-  // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_TRUE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_eq_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_eq_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_os
-  // CHECK: fcmp oeq <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_EQ_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_lt_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp olt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_lt_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_lt_oq
-  // CHECK: fcmp olt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_le_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ole <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_le_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_le_oq
-  // CHECK: fcmp ole <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_unord_s(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_unord_s(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_unord_s
-  // CHECK: fcmp uno <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_UNORD_S);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_neq_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp une <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_neq_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_us
-  // CHECK: fcmp une <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nlt_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp uge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nlt_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nlt_uq
-  // CHECK: fcmp uge <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NLT_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nle_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ugt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nle_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nle_uq
-  // CHECK: fcmp ugt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NLE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ord_s(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ord <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ord_s(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ord_s
-  // CHECK: fcmp ord <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_ORD_S);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_eq_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ueq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_eq_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_eq_us
-  // CHECK: fcmp ueq <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_EQ_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_nge_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ult <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_nge_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_nge_uq
-  // CHECK: fcmp ult <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ngt_uq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ule <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ngt_uq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ngt_uq
-  // CHECK: fcmp ule <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NGT_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_false_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp false <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_false_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_false_os
-  // CHECK: fcmp false <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_FALSE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_neq_os(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp one <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_neq_os(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_neq_os
-  // CHECK: fcmp one <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_NEQ_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_ge_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp oge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_ge_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_ge_oq
-  // CHECK: fcmp oge <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_GE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_gt_oq(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp ogt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_gt_oq(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_gt_oq
-  // CHECK: fcmp ogt <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_GT_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_cmp_ph_mask_true_us(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = fcmp true <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i1> [[TMP2]] to i32
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
 __mmask32 test_mm512_cmp_ph_mask_true_us(__m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_cmp_ph_mask_true_us
-  // CHECK: fcmp true <32 x half> %{{.*}}, %{{.*}}
   return _mm512_cmp_ph_mask(a, b, _CMP_TRUE_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_eq_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_eq_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: @test_mm512_mask_cmp_ph_mask_eq_oq
-  // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_lt_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_lt_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_os
-  // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_le_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ole <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_le_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_os
-  // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_unord_q(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_unord_q(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_q
-  // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_Q);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_neq_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp une <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_neq_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_uq
-  // CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nlt_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp uge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nlt_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_us
-  // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nle_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ugt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nle_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_us
-  // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ord_q(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ord <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ord_q(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ord_q
-  // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_Q);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_eq_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ueq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_eq_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_uq
-  // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nge_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ult <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nge_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_us
-  // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ngt_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ule <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ngt_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_us
-  // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_false_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp false <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_false_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_oq
-  // CHECK: [[CMP:%.*]] = fcmp false <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_neq_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp one <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_neq_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_oq
-  // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ge_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ge_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_os
-  // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_gt_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_gt_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_os
-  // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_true_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp true <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_true_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_uq
-  // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_eq_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oeq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_eq_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_os
-  // CHECK: [[CMP:%.*]] = fcmp oeq <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_lt_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp olt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_lt_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_lt_oq
-  // CHECK: [[CMP:%.*]] = fcmp olt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LT_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_le_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ole <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_le_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_le_oq
-  // CHECK: [[CMP:%.*]] = fcmp ole <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_LE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_unord_s(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_unord_s(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_unord_s
-  // CHECK: [[CMP:%.*]] = fcmp uno <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_UNORD_S);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_neq_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp une <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_neq_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_us
-  // CHECK: [[CMP:%.*]] = fcmp une <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nlt_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp uge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nlt_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nlt_uq
-  // CHECK: [[CMP:%.*]] = fcmp uge <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLT_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nle_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ugt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nle_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nle_uq
-  // CHECK: [[CMP:%.*]] = fcmp ugt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NLE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ord_s(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ord <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ord_s(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ord_s
-  // CHECK: [[CMP:%.*]] = fcmp ord <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_ORD_S);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_eq_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ueq <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_eq_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_eq_us
-  // CHECK: [[CMP:%.*]] = fcmp ueq <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_EQ_US);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_nge_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ult <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_nge_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_nge_uq
-  // CHECK: [[CMP:%.*]] = fcmp ult <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGE_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ngt_uq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ule <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ngt_uq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ngt_uq
-  // CHECK: [[CMP:%.*]] = fcmp ule <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NGT_UQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_false_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp false <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_false_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_false_os
-  // CHECK: [[CMP:%.*]] = fcmp false <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_FALSE_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_neq_os(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp one <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_neq_os(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_neq_os
-  // CHECK: [[CMP:%.*]] = fcmp one <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_NEQ_OS);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_ge_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp oge <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_ge_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_ge_oq
-  // CHECK: [[CMP:%.*]] = fcmp oge <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GE_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_gt_oq(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp ogt <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_gt_oq(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_gt_oq
-  // CHECK: [[CMP:%.*]] = fcmp ogt <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_GT_OQ);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_cmp_ph_mask_true_us(
+// CHECK-SAME: i32 noundef [[M:%.*]], <32 x half> noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[M_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[M]], ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[M_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = fcmp true <32 x half> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = and <32 x i1> [[TMP3]], [[TMP4]]
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x i1> [[TMP5]] to i32
+// CHECK-NEXT:    ret i32 [[TMP6]]
+//
 __mmask32 test_mm512_mask_cmp_ph_mask_true_us(__mmask32 m, __m512h a, __m512h b) {
-  // CHECK-LABEL: test_mm512_mask_cmp_ph_mask_true_us
-  // CHECK: [[CMP:%.*]] = fcmp true <32 x half> %{{.*}}, %{{.*}}
-  // CHECK: and <32 x i1> [[CMP]], {{.*}}
   return _mm512_mask_cmp_ph_mask(m, a, b, _CMP_TRUE_US);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_cmp_round_sh_mask(
+// CHECK-SAME: <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 5, i8 -1, i32 8)
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
 __mmask8 test_mm_cmp_round_sh_mask(__m128h __X, __m128h __Y) {
-  // CHECK-LABEL: @test_mm_cmp_round_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh
   return _mm_cmp_round_sh_mask(__X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_mask_cmp_round_sh_mask(
+// CHECK-SAME: i8 noundef zeroext [[__M:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__M]], ptr [[__M_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[__M_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 5, i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret i8 [[TMP3]]
+//
 __mmask8 test_mm_mask_cmp_round_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_round_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh
   return _mm_mask_cmp_round_sh_mask(__M, __X, __Y, _CMP_NLT_US, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_cmp_sh_mask(
+// CHECK-SAME: <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 5, i8 -1, i32 4)
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
 __mmask8 test_mm_cmp_sh_mask(__m128h __X, __m128h __Y) {
-  // CHECK-LABEL: @test_mm_cmp_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh
   return _mm_cmp_sh_mask(__X, __Y, _CMP_NLT_US);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_mask_cmp_sh_mask(
+// CHECK-SAME: i8 noundef zeroext [[__M:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__M_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__M]], ptr [[__M_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[__M_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 5, i8 [[TMP2]], i32 4)
+// CHECK-NEXT:    ret i8 [[TMP3]]
+//
 __mmask8 test_mm_mask_cmp_sh_mask(__mmask8 __M, __m128h __X, __m128h __Y) {
-  // CHECK-LABEL: @test_mm_mask_cmp_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.cmp.sh
   return _mm_mask_cmp_sh_mask(__M, __X, __Y, _CMP_NLT_US);
 }
 
 // VMOVSH
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_load_sh(
+// CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__U_I:%.*]] = alloca half, align 2
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, ptr [[TMP1]], align 1
+// CHECK-NEXT:    store half [[TMP2]], ptr [[__U_I]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, ptr [[__U_I]], align 2
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x half> undef, half [[TMP3]], i32 0
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x half> [[VECINIT_I]], half 0xH0000, i32 1
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x half> [[VECINIT2_I]], half 0xH0000, i32 2
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x half> [[VECINIT3_I]], half 0xH0000, i32 3
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x half> [[VECINIT4_I]], half 0xH0000, i32 4
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x half> [[VECINIT5_I]], half 0xH0000, i32 5
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x half> [[VECINIT6_I]], half 0xH0000, i32 6
+// CHECK-NEXT:    [[VECINIT8_I:%.*]] = insertelement <8 x half> [[VECINIT7_I]], half 0xH0000, i32 7
+// CHECK-NEXT:    store <8 x half> [[VECINIT8_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_load_sh(void const *A) {
-  // CHECK-LABEL: test_mm_load_sh
-  // CHECK: load half, ptr %{{.*}}, align 1{{$}}
   return _mm_load_sh(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_load_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], i8 noundef zeroext [[__U:%.*]], ptr noundef [[__W:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[SRC_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store ptr [[__W]], ptr [[__W_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__W_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store ptr [[TMP2]], ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+// CHECK-NEXT:    store <8 x half> [[SHUFFLE_I]], ptr [[SRC_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[SRC_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[CONV_I:%.*]] = zext i8 [[TMP7]] to i32
+// CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[CONV_I]], 1
+// CHECK-NEXT:    [[CONV1_I:%.*]] = trunc i32 [[AND_I]] to i8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[CONV1_I]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[TMP5]], i32 1, <8 x i1> [[TMP8]], <8 x half> [[TMP6]])
+// CHECK-NEXT:    ret <8 x half> [[TMP9]]
+//
 __m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) {
-  // CHECK-LABEL: @test_mm_mask_load_sh
-  // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
   return _mm_mask_load_sh(__A, __U, __W);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_load_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], ptr noundef [[__W:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store ptr [[__W]], ptr [[__W_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__W_ADDR]], align 8
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[CONV_I:%.*]] = zext i8 [[TMP4]] to i32
+// CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[CONV_I]], 1
+// CHECK-NEXT:    [[CONV1_I:%.*]] = trunc i32 [[AND_I]] to i8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[CONV1_I]] to <8 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0(ptr [[TMP2]], i32 1, <8 x i1> [[TMP5]], <8 x half> [[TMP3]])
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm_maskz_load_sh(__mmask8 __U, const void *__W) {
-  // CHECK-LABEL: @test_mm_maskz_load_sh
-  // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}})
   return _mm_maskz_load_sh(__U, __W);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_load_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[TMP1]], align 64
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_load_ph(void *p) {
-  // CHECK-LABEL: @test_mm512_load_ph
-  // CHECK: load <32 x half>, ptr %{{.*}}, align 64{{$}}
   return _mm512_load_ph(p);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_load_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[TMP1]], align 32
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm256_load_ph(void *p) {
-  // CHECK-LABEL: @test_mm256_load_ph
-  // CHECK: load <16 x half>, ptr %{{.*}}, align 32{{$}}
   return _mm256_load_ph(p);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_load_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[TMP1]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_load_ph(void *p) {
-  // CHECK-LABEL: @test_mm_load_ph
-  // CHECK: load <8 x half>, ptr %{{.*}}, align 16{{$}}
   return _mm_load_ph(p);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_loadu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[TMP1]], align 1
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_loadu_ph(void *p) {
-  // CHECK-LABEL: @test_mm512_loadu_ph
-  // CHECK: load <32 x half>, ptr {{.*}}, align 1{{$}}
   return _mm512_loadu_ph(p);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm256_loadu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[TMP1]], align 1
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm256_loadu_ph(void *p) {
-  // CHECK-LABEL: @test_mm256_loadu_ph
-  // CHECK: load <16 x half>, ptr {{.*}}, align 1{{$}}
   return _mm256_loadu_ph(p);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_loadu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_loadu_ph(void *p) {
-  // CHECK-LABEL: @test_mm_loadu_ph
-  // CHECK: load <8 x half>, ptr {{.*}}, align 1{{$}}
   return _mm_loadu_ph(p);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm_store_sh(
+// CHECK-SAME: ptr noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 8
+// CHECK-NEXT:    store half [[VECEXT_I]], ptr [[TMP3]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_mm_store_sh(void *A, __m128h B) {
-  // CHECK-LABEL: test_mm_store_sh
-  // CHECK: extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: store half %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_store_sh(A, B);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm_mask_store_sh(
+// CHECK-SAME: ptr noundef [[__P:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store ptr [[__P]], ptr [[__P_ADDR]], align 8
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__W_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[CONV_I:%.*]] = zext i8 [[TMP5]] to i32
+// CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[CONV_I]], 1
+// CHECK-NEXT:    [[CONV1_I:%.*]] = trunc i32 [[AND_I]] to i8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[CONV1_I]] to <8 x i1>
+// CHECK-NEXT:    call void @llvm.masked.store.v8f16.p0(<8 x half> [[TMP4]], ptr [[TMP3]], i32 1, <8 x i1> [[TMP6]])
+// CHECK-NEXT:    ret void
+//
 void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) {
-  // CHECK-LABEL: @test_mm_mask_store_sh
-  // CHECK: call void @llvm.masked.store.v8f16.p0(<8 x half> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}})
   _mm_mask_store_sh(__P, __U, __A);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm512_store_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[TMP3]], align 64
+// CHECK-NEXT:    ret void
+//
 void test_mm512_store_ph(void *p, __m512h a) {
-  // CHECK-LABEL: @test_mm512_store_ph
-  // CHECK: store <32 x half> %{{.*}}, ptr %{{.*}}, align 64
   _mm512_store_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm256_store_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <16 x half> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[TMP3]], align 32
+// CHECK-NEXT:    ret void
+//
 void test_mm256_store_ph(void *p, __m256h a) {
-  // CHECK-LABEL: @test_mm256_store_ph
-  // CHECK: store <16 x half> %{{.*}}, ptr %{{.*}}, align 32
   _mm256_store_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm_store_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[TMP3]], align 16
+// CHECK-NEXT:    ret void
+//
 void test_mm_store_ph(void *p, __m128h a) {
-  // CHECK-LABEL: @test_mm_store_ph
-  // CHECK: store <8 x half> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm512_storeu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[TMP3]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_mm512_storeu_ph(void *p, __m512h a) {
-  // CHECK-LABEL: @test_mm512_storeu_ph
-  // CHECK: store <32 x half> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm512_storeu_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm256_storeu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <16 x half> noundef [[A:%.*]]) #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[TMP3]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_mm256_storeu_ph(void *p, __m256h a) {
-  // CHECK-LABEL: @test_mm256_storeu_ph
-  // CHECK: store <16 x half> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm256_storeu_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local void @test_mm_storeu_ph(
+// CHECK-SAME: ptr noundef [[P:%.*]], <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[TMP3]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_mm_storeu_ph(void *p, __m128h a) {
-  // CHECK-LABEL: @test_mm_storeu_ph
-  // CHECK: store <8 x half> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_ph(p, a);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_move_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x half> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP3]], half [[VECEXT_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_move_sh(__m128h A, __m128h B) {
-  // CHECK-LABEL: test_mm_move_sh
-  // CHECK: extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0
   return _mm_move_sh(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_move_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP6]], ptr [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I_I:%.*]] = extractelement <8 x half> [[TMP7]], i32 0
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[VECINS_I_I:%.*]] = insertelement <8 x half> [[TMP8]], half [[VECEXT_I_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I_I]], ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x half> [[TMP9]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x half> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+// CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
+// CHECK-NEXT:    [[TMP15:%.*]] = select i1 [[TMP14]], half [[TMP11]], half [[TMP12]]
+// CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x half> [[TMP9]], half [[TMP15]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP16]]
+//
 __m128h test_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_move_sh
-  // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]]
-  // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0
   return _mm_mask_move_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_move_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP5]], ptr [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I_I:%.*]] = extractelement <8 x half> [[TMP6]], i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[VECINS_I_I:%.*]] = insertelement <8 x half> [[TMP7]], half [[VECEXT_I_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I_I]], ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__A_ADDR_I_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x half> [[TMP9]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP10]], half [[TMP11]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP8]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_move_sh
-  // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0
-  // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0
-  // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0
-  // CHECK-NEXT: [[B:%.*]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]]
-  // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0
   return _mm_maskz_move_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local signext i16 @test_mm_cvtsi128_si16(
+// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    [[__B_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// CHECK-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    store <8 x i16> [[TMP2]], ptr [[__B_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[__B_I]], align 16
+// CHECK-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+// CHECK-NEXT:    ret i16 [[VECEXT_I]]
+//
 short test_mm_cvtsi128_si16(__m128i A) {
-  // CHECK-LABEL: test_mm_cvtsi128_si16
-  // CHECK: extractelement <8 x i16> %{{.*}}, i32 0
   return _mm_cvtsi128_si16(A);
 }
 
+// CHECK-LABEL: define dso_local <2 x i64> @test_mm_cvtsi16_si128(
+// CHECK-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__A_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__A_ADDR_I]], align 2
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 [[TMP1]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 0, i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 0, i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 0, i32 3
+// CHECK-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 0, i32 4
+// CHECK-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 0, i32 5
+// CHECK-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 0, i32 6
+// CHECK-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 0, i32 7
+// CHECK-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvtsi16_si128(short A) {
-  // CHECK-LABEL: test_mm_cvtsi16_si128
-  // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
   return _mm_cvtsi16_si128(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_rcp_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> [[TMP1]], <32 x half> zeroinitializer, i32 -1)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_rcp_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_rcp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
   return _mm512_rcp_ph(__A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_rcp_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], i32 [[TMP5]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_rcp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
   return (__m512h)_mm512_mask_rcp_ph(__W, __U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_rcp_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rcp.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], i32 [[TMP4]])
+// CHECK-NEXT:    ret <32 x half> [[TMP5]]
+//
 __m512h test_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_rcp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.ph.512
   return _mm512_maskz_rcp_ph(__U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_rsqrt_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> [[TMP1]], <32 x half> zeroinitializer, i32 -1)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_rsqrt_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_rsqrt_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
   return _mm512_rsqrt_ph(__A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_rsqrt_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], i32 [[TMP5]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_rsqrt_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
   return (__m512h)_mm512_mask_rsqrt_ph(__W, __U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_rsqrt_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rsqrt.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], i32 [[TMP4]])
+// CHECK-NEXT:    ret <32 x half> [[TMP5]]
+//
 __m512h test_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_rsqrt_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.ph.512
   return _mm512_maskz_rsqrt_ph(__U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_getmant_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> zeroinitializer, i32 -1, i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_getmant_round_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_getmant_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_getmant_round_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_getmant_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> [[TMP1]], i32 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_getmant_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_mask_getmant_round_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_getmant_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> [[TMP1]], i32 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_getmant_round_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_maskz_getmant_round_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_getmant_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> zeroinitializer, i32 -1, i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_getmant_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_getmant_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_getmant_ph(__A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_getmant_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> [[TMP1]], i32 [[TMP2]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_getmant_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_getmant_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_mask_getmant_ph(__W, __U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_getmant_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getmant.ph.512(<32 x half> [[TMP0]], i32 9, <32 x half> [[TMP1]], i32 [[TMP2]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_getmant_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getmant_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.ph.512
   return _mm512_maskz_getmant_ph(__U, __A, _MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_scalef_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> zeroinitializer, i32 -1, i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_scalef_round_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_scalef_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_scalef_round_ph(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_scalef_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_mask_scalef_round_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_mask_scalef_round_ph(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_scalef_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_maskz_scalef_round_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_maskz_scalef_round_ph(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_scalef_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], <32 x half> zeroinitializer, i32 -1, i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_scalef_ph(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_scalef_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_scalef_ph(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_scalef_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]], i32 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_scalef_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_mask_scalef_ph(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_scalef_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.scalef.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], <32 x half> [[TMP5]], i32 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_scalef_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.ph.512
   return _mm512_maskz_scalef_ph(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_roundscale_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 1, <32 x half> [[TMP1]], i32 [[CONV]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_roundscale_ph(__m512h __W, __mmask16 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_mask_roundscale_ph(__W, __U, __A, 1);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_roundscale_ph(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 1, <32 x half> [[TMP1]], i32 [[CONV]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_roundscale_ph(__mmask16 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_maskz_roundscale_ph(__U, __A, 1);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_roundscale_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> [[TMP1]], i32 [[CONV]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_roundscale_round_ph(__m512h __A, __mmask16 __U, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_roundscale_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_mask_roundscale_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_roundscale_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> [[TMP1]], i32 [[CONV]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_roundscale_round_ph(__m512h __A, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_maskz_roundscale_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_roundscale_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> zeroinitializer, i32 -1, i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_roundscale_round_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_roundscale_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_roundscale_round_ph(__A, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_roundscale_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.rndscale.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> [[TMP1]], i32 -1, i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_roundscale_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_roundscale_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.ph.512
   return _mm512_roundscale_ph(__A, 3);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_getexp_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP0]], <32 x half> zeroinitializer, i32 -1, i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_getexp_round_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_getexp_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_getexp_round_ph(__A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_getexp_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_getexp_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_mask_getexp_round_ph(__W, __U, __A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_getexp_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], i32 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_getexp_round_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_maskz_getexp_round_ph(__U, __A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_getexp_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP1]], <32 x half> zeroinitializer, i32 -1, i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_getexp_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_getexp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_getexp_ph(__A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_getexp_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], i32 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_getexp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_mask_getexp_ph(__W, __U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_getexp_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.getexp.ph.512(<32 x half> [[TMP2]], <32 x half> [[TMP3]], i32 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP5]]
+//
 __m512h test_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_getexp_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.ph.512
   return _mm512_maskz_getexp_ph(__U, __A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_reduce_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 1, <32 x half> [[TMP1]], i32 [[CONV]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_reduce_ph(__m512h __W, __mmask16 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_mask_reduce_ph(__W, __U, __A, 1);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_reduce_ph(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 1, <32 x half> [[TMP1]], i32 [[CONV]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_reduce_ph(__mmask16 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_maskz_reduce_ph(__U, __A, 1);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_reduce_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> [[TMP1]], i32 [[CONV]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_mask_reduce_round_ph(__m512h __A, __mmask16 __U, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_reduce_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_mask_reduce_round_ph(__A, __U, __C, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_reduce_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> [[TMP1]], i32 [[CONV]], i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_maskz_reduce_round_ph(__m512h __A, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_maskz_reduce_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_maskz_reduce_round_ph(__U, __A, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_reduce_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> zeroinitializer, i32 -1, i32 8)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_reduce_round_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_reduce_round_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_reduce_round_ph(__A, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_reduce_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.mask.reduce.ph.512(<32 x half> [[TMP0]], i32 3, <32 x half> zeroinitializer, i32 -1, i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_reduce_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_reduce_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.ph.512
   return _mm512_reduce_ph(__A, 3);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_rcp_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_rcp_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_rcp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
   return _mm_rcp_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_rcp_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]])
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_rcp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_rcp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
   return _mm_mask_rcp_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_rcp_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rcp.sh(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]])
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_rcp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_rcp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rcp.sh
   return _mm_maskz_rcp_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_rsqrt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_rsqrt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_rsqrt_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
   return _mm_rsqrt_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_rsqrt_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]])
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_rsqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_rsqrt_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
   return _mm_mask_rsqrt_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_rsqrt_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rsqrt.sh(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]])
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_rsqrt_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rsqrt.sh
   return _mm_maskz_rsqrt_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_getmant_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 0, <8 x half> [[TMP2]], i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_getmant_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_getmant_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_getmant_round_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_getmant_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 0, <8 x half> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_getmant_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_getmant_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_getmant_sh(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_getmant_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 9, <8 x half> [[TMP2]], i8 [[TMP3]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_getmant_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_getmant_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_mask_getmant_sh(__W, __U, __A, __B, 1, 2);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_getmant_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 9, <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_getmant_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_getmant_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_mask_getmant_round_sh(__W, __U, __A, __B, 1, 2, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_getmant_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 9, <8 x half> [[TMP2]], i8 [[TMP3]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_getmant_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_getmant_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_maskz_getmant_sh(__U, __A, __B, 1, 2);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_getmant_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getmant.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], i32 9, <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_getmant_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_getmant_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getmant.sh
   return _mm_maskz_getmant_round_sh(__U, __A, __B, 1, 2, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_getexp_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_getexp_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_getexp_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_getexp_round_sh(__A, __B, 8);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_getexp_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_getexp_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_getexp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_getexp_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_getexp_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_getexp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_mask_getexp_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_getexp_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_getexp_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_getexp_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_mask_getexp_round_sh(__W, __U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_getexp_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_getexp_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_maskz_getexp_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_getexp_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.getexp.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_getexp_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_getexp_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.getexp.sh
   return _mm_maskz_getexp_round_sh(__U, __A, __B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_scalef_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_scalef_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_scalef_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11)
   return _mm_scalef_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_scalef_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x half> [[TMP4]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm_scalef_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_scalef_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
   return _mm_scalef_sh(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_scalef_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP4]], <8 x half> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_scalef_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
   return _mm_mask_scalef_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_scalef_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_scalef_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_scalef_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11)
   return _mm_mask_scalef_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_scalef_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP3]], <8 x half> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_scalef_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh
   return _mm_maskz_scalef_sh(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_scalef_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_scalef_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_scalef_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.scalef.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 %{{.*}}, i32 11)
   return _mm_maskz_scalef_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_roundscale_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 3, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_roundscale_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_roundscale_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_roundscale_round_sh(__A, __B, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_roundscale_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 3, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_roundscale_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_roundscale_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_roundscale_sh(__A, __B, 3);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_roundscale_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_roundscale_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_roundscale_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_mask_roundscale_sh(__W, __U, __A, __B, 3);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_roundscale_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 3, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_roundscale_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_roundscale_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_mask_roundscale_round_sh(__W, __U, __A, __B, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_roundscale_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 3, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_roundscale_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_roundscale_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_maskz_roundscale_round_sh(__U, __A, __B, 3, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_roundscale_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.rndscale.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 3, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_roundscale_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_roundscale_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.rndscale.sh
   return _mm_maskz_roundscale_sh(__U, __A, __B, 3);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_reduce_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 4, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_reduce_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_reduce_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_reduce_sh(__A, __B, 4);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_reduce_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 4, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_reduce_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_mask_reduce_sh(__W, __U, __A, __B, 4);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_reduce_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 4, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_reduce_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_maskz_reduce_sh(__U, __A, __B, 4);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_reduce_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 4, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_reduce_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_reduce_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_reduce_round_sh(__A, __B, 4, 8);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_reduce_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 4, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_reduce_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_reduce_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_mask_reduce_round_sh(__W, __U, __A, __B, 4, 8);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_reduce_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.reduce.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 4, i32 8)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_reduce_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_reduce_round_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.reduce.sh
   return _mm_maskz_reduce_round_sh(__U, __A, __B, 4, 8);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_sqrt_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP1]]
+//
 __m512h test_mm512_sqrt_round_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_sqrt_round_ph
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
   return _mm512_sqrt_round_ph(__A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_sqrt_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x half> [[TMP2]], <32 x half> [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[TMP5]]
+//
 __m512h test_mm512_mask_sqrt_round_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_round_ph
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_sqrt_round_ph(__W, __U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_sqrt_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> [[TMP1]], i32 11)
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP0]] to <32 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <32 x i1> [[TMP4]], <32 x half> [[TMP2]], <32 x half> [[TMP3]]
+// CHECK-NEXT:    ret <32 x half> [[TMP5]]
+//
 __m512h test_mm512_maskz_sqrt_round_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ph
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.sqrt.ph.512(<32 x half> %{{.*}}, i32 11)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
   return _mm512_maskz_sqrt_round_ph(__U, __A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_sqrt_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.sqrt.v32f16(<32 x half> [[TMP1]])
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_sqrt_ph(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_sqrt_ph
-  // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
   return _mm512_sqrt_ph(__A);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_sqrt_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.sqrt.v32f16(<32 x half> [[TMP4]])
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <32 x i1> [[TMP7]], <32 x half> [[TMP5]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_sqrt_ph
-  // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_sqrt_ph(__W, __U, __A);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_sqrt_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.sqrt.v32f16(<32 x half> [[TMP3]])
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP2]] to <32 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x half> [[TMP4]], <32 x half> [[TMP5]]
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_maskz_sqrt_ph
-  // CHECK: %{{.*}} = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> {{.*}}
   return _mm512_maskz_sqrt_ph(__U, __A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_sqrt_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm_sqrt_round_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_sqrt_round_sh
-  // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 -1, i32 11)
   return _mm_sqrt_round_sh(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_sqrt_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_sqrt_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_sqrt_round_sh
-  // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11)
   return _mm_mask_sqrt_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_sqrt_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_sqrt_round_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_sqrt_round_sh
-  // CHECK: call <8 x half> @llvm.x86.avx512fp16.mask.sqrt.sh(<8 x half> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}}, i8 {{.*}}, i32 11)
   return _mm_maskz_sqrt_round_sh(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_sqrt_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call half @llvm.sqrt.f16(half [[TMP5]])
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP6]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_sqrt_sh(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_sqrt_sh
-  // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
   return _mm_sqrt_sh(__A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_sqrt_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP1]] to i32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i32 [[CONV]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP7]] to i8
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call half @llvm.sqrt.f16(half [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast i8 [[CONV_I]] to <8 x i1>
+// CHECK-NEXT:    [[TMP12:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
+// CHECK-NEXT:    [[TMP13:%.*]] = select i1 [[TMP12]], half [[TMP9]], half [[TMP10]]
+// CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP13]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP14]]
+//
 __m128h test_mm_mask_sqrt_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_sqrt_sh
-  // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
   return _mm_mask_sqrt_sh(__W, __U, __A, __B);
 }
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_sqrt_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[CONV]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc i32 [[TMP6]] to i8
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = call half @llvm.sqrt.f16(half [[TMP7]])
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[CONV_I]] to <8 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], half [[TMP8]], half [[TMP9]]
+// CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP12]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP13]]
+//
 __m128h test_mm_maskz_sqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_sqrt_sh
-  // CHECK: %{{.*}} = call half @llvm.sqrt.f16(half %{{.*}})
   return _mm_maskz_sqrt_sh(__U, __A, __B);
 }
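+// Reading aid for the fpclass checks below (not an autogenerated line): the
+// i32 immediate is the VFPCLASS category mask, where bit 1 (value 2) selects
+// positive zero and bit 2 (value 4) selects negative zero.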
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_mask_fpclass_ph_mask(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> [[TMP0]], i32 4)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP1]] to <32 x i1>
+// CHECK-NEXT:    [[TMP4:%.*]] = and <32 x i1> [[TMP2]], [[TMP3]]
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i1> [[TMP4]] to i32
+// CHECK-NEXT:    ret i32 [[TMP5]]
+//
 __mmask32 test_mm512_mask_fpclass_ph_mask(__mmask32 __U, __m512h __A) {
-  // CHECK-LABEL: @test_mm512_mask_fpclass_ph_mask
-  // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512
   return _mm512_mask_fpclass_ph_mask(__U, __A, 4);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm512_fpclass_ph_mask(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i1> @llvm.x86.avx512fp16.fpclass.ph.512(<32 x half> [[TMP0]], i32 4)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <32 x i1> [[TMP1]] to i32
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 __mmask32 test_mm512_fpclass_ph_mask(__m512h __A) {
-  // CHECK-LABEL: @test_mm512_fpclass_ph_mask
-  // CHECK: @llvm.x86.avx512fp16.fpclass.ph.512
   return _mm512_fpclass_ph_mask(__A, 4);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_fpclash_sh_mask(
+// CHECK-SAME: <4 x float> noundef [[__A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <4 x float> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP2:%.*]] = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> [[TMP1]], i32 2, i8 -1)
+// CHECK-NEXT:    ret i8 [[TMP2]]
+//
 __mmask8 test_mm_fpclash_sh_mask(__m128 __A) {
-  // CHECK-LABEL: @test_mm_fpclash_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh
   return _mm_fpclass_sh_mask(__A, 2);
 }
 
+// CHECK-LABEL: define dso_local zeroext i8 @test_mm_mask_fpclash_sh_mask(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <4 x float> noundef [[__A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <4 x float> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call i8 @llvm.x86.avx512fp16.mask.fpclass.sh(<8 x half> [[TMP1]], i32 2, i8 [[TMP2]])
+// CHECK-NEXT:    ret i8 [[TMP3]]
+//
 __mmask8 test_mm_mask_fpclash_sh_mask(__mmask8 __U, __m128 __A) {
-  // CHECK-LABEL: @test_mm_mask_fpclash_sh_mask
-  // CHECK: @llvm.x86.avx512fp16.mask.fpclass.sh
   return _mm_mask_fpclass_sh_mask(__U, __A, 2);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvt_roundpd_ph(
+// CHECK-SAME: <8 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store <8 x double> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP0]], <8 x half> zeroinitializer, i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
 __m128h test_mm512_cvt_roundpd_ph(__m512d A) {
-  // CHECK-LABEL: test_mm512_cvt_roundpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_cvt_roundpd_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvt_roundpd_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x double> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP0]], <8 x half> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm512_mask_cvt_roundpd_ph(__m128h A, __mmask8 B, __m512d C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_mask_cvt_roundpd_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvt_roundpd_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x double> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP0]], <8 x half> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm512_maskz_cvt_roundpd_ph(__mmask8 A, __m512d B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_maskz_cvt_roundpd_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvtpd_ph(
+// CHECK-SAME: <8 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store <8 x double> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP1]], <8 x half> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm512_cvtpd_ph(__m512d A) {
-  // CHECK-LABEL: test_mm512_cvtpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_cvtpd_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvtpd_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x double> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x double> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x double> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x double>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP3]], <8 x half> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm512_mask_cvtpd_ph(__m128h A, __mmask8 B, __m512d C) {
-  // CHECK-LABEL: test_mm512_mask_cvtpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_mask_cvtpd_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvtpd_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x double> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtpd2ph.512(<8 x double> [[TMP2]], <8 x half> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm512_maskz_cvtpd_ph(__mmask8 A, __m512d B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtpd_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtpd2ph.512
   return _mm512_maskz_cvtpd_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_cvt_roundph_pd(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP0]], <8 x double> zeroinitializer, i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x double> [[TMP1]]
+//
 __m512d test_mm512_cvt_roundph_pd(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_cvt_roundph_pd(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_mask_cvt_roundph_pd(
+// CHECK-SAME: <8 x double> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x double> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP0]], <8 x double> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x double> [[TMP3]]
+//
 __m512d test_mm512_mask_cvt_roundph_pd(__m512d A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_mask_cvt_roundph_pd(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_maskz_cvt_roundph_pd(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP0]], <8 x double> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x double> [[TMP3]]
+//
 __m512d test_mm512_maskz_cvt_roundph_pd(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_maskz_cvt_roundph_pd(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_cvtph_pd(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x double>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP1]], <8 x double> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x double> [[TMP3]]
+//
 __m512d test_mm512_cvtph_pd(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvtph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_cvtph_pd(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_mask_cvtph_pd(
+// CHECK-SAME: <8 x double> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x double> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x double>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x double> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP3]], <8 x double> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x double> [[TMP6]]
+//
 __m512d test_mm512_mask_cvtph_pd(__m512d A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_mask_cvtph_pd(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x double> @test_mm512_maskz_cvtph_pd(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x double>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x double>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512fp16.mask.vcvtph2pd.512(<8 x half> [[TMP2]], <8 x double> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x double> [[TMP5]]
+//
 __m512d test_mm512_maskz_cvtph_pd(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_pd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2pd.512
   return _mm512_maskz_cvtph_pd(A, B);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_cvt_roundsh_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP0]], <8 x half> [[TMP1]], <4 x float> zeroinitializer, i8 -1, i32 8)
+// CHECK-NEXT:    ret <4 x float> [[TMP2]]
+//
 __m128 test_mm_cvt_roundsh_ss(__m128 A, __m128h B) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_cvt_roundsh_ss(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_mask_cvt_roundsh_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <4 x float> noundef [[C:%.*]], <8 x half> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
 __m128 test_mm_mask_cvt_roundsh_ss(__m128 A, __mmask8 B, __m128 C, __m128h D) {
-  // CHECK-LABEL: test_mm_mask_cvt_roundsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_mask_cvt_roundsh_ss(A, B, C, D, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_maskz_cvt_roundsh_ss(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <4 x float> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
 __m128 test_mm_maskz_cvt_roundsh_ss(__mmask8 A, __m128 B, __m128h C) {
-  // CHECK-LABEL: test_mm_maskz_cvt_roundsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_maskz_cvt_roundsh_ss(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_cvtsh_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP2]], <8 x half> [[TMP3]], <4 x float> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    ret <4 x float> [[TMP4]]
+//
 __m128 test_mm_cvtsh_ss(__m128 A, __m128h B) {
-  // CHECK-LABEL: test_mm_cvtsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_cvtsh_ss(A, B);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_mask_cvtsh_ss(
+// CHECK-SAME: <4 x float> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <4 x float> noundef [[C:%.*]], <8 x half> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <4 x float> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP4]], <8 x half> [[TMP5]], <4 x float> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <4 x float> [[TMP8]]
+//
 __m128 test_mm_mask_cvtsh_ss(__m128 A, __mmask8 B, __m128 C, __m128h D) {
-  // CHECK-LABEL: test_mm_mask_cvtsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_mask_cvtsh_ss(A, B, C, D);
 }
 
+// CHECK-LABEL: define dso_local <4 x float> @test_mm_maskz_cvtsh_ss(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <4 x float> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <4 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vcvtsh2ss.round(<4 x float> [[TMP3]], <8 x half> [[TMP4]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <4 x float> [[TMP7]]
+//
 __m128 test_mm_maskz_cvtsh_ss(__mmask8 A, __m128 B, __m128h C) {
-  // CHECK-LABEL: test_mm_maskz_cvtsh_ss
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2ss.round
   return _mm_maskz_cvtsh_ss(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundss_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP0]], <4 x float> [[TMP1]], <8 x half> zeroinitializer, i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundss_sh(__m128h A, __m128 B) {
-  // CHECK-LABEL: test_mm_cvt_roundss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_cvt_roundss_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cvt_roundss_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]], <4 x float> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP0]], <4 x float> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_cvt_roundss_sh(__m128h A, __mmask8 B, __m128h C, __m128 D) {
-  // CHECK-LABEL: test_mm_mask_cvt_roundss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_mask_cvt_roundss_sh(A, B, C, D, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cvt_roundss_sh(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP0]], <4 x float> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_cvt_roundss_sh(__mmask8 A, __m128h B, __m128 C) {
-  // CHECK-LABEL: test_mm_maskz_cvt_roundss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_maskz_cvt_roundss_sh(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvtss_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP2]], <4 x float> [[TMP3]], <8 x half> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvtss_sh(__m128h A, __m128 B) {
-  // CHECK-LABEL: test_mm_cvtss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_cvtss_sh(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cvtss_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]], <4 x float> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP4]], <4 x float> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_cvtss_sh(__m128h A, __mmask8 B, __m128h C, __m128 D) {
-  // CHECK-LABEL: test_mm_mask_cvtss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_mask_cvtss_sh(A, B, C, D);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cvtss_sh(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <4 x float> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtss2sh.round(<8 x half> [[TMP3]], <4 x float> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_cvtss_sh(__mmask8 A, __m128h B, __m128 C) {
-  // CHECK-LABEL: test_mm_maskz_cvtss_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtss2sh.round
   return _mm_maskz_cvtss_sh(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundsd_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP0]], <2 x double> [[TMP1]], <8 x half> zeroinitializer, i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundsd_sh(__m128h A, __m128d B) {
-  // CHECK-LABEL: test_mm_cvt_roundsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_cvt_roundsd_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cvt_roundsd_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]], <2 x double> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP0]], <2 x double> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_mask_cvt_roundsd_sh(__m128h A, __mmask8 B, __m128h C, __m128d D) {
-  // CHECK-LABEL: test_mm_mask_cvt_roundsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_mask_cvt_roundsd_sh(A, B, C, D, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cvt_roundsd_sh(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP0]], <2 x double> [[TMP1]], <8 x half> [[TMP2]], i8 [[TMP3]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_maskz_cvt_roundsd_sh(__mmask8 A, __m128h B, __m128d C) {
-  // CHECK-LABEL: test_mm_maskz_cvt_roundsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_maskz_cvt_roundsd_sh(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvtsd_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP2]], <2 x double> [[TMP3]], <8 x half> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvtsd_sh(__m128h A, __m128d B) {
-  // CHECK-LABEL: test_mm_cvtsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_cvtsd_sh(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cvtsd_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]], <2 x double> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP4]], <2 x double> [[TMP5]], <8 x half> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_cvtsd_sh(__m128h A, __mmask8 B, __m128h C, __m128d D) {
-  // CHECK-LABEL: test_mm_mask_cvtsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_mask_cvtsd_sh(A, B, C, D);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cvtsd_sh(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]], <2 x double> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <8 x half> @llvm.x86.avx512fp16.mask.vcvtsd2sh.round(<8 x half> [[TMP3]], <2 x double> [[TMP4]], <8 x half> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_maskz_cvtsd_sh(__mmask8 A, __m128h B, __m128d C) {
-  // CHECK-LABEL: test_mm_maskz_cvtsd_sh
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsd2sh.round
   return _mm_maskz_cvtsd_sh(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_cvt_roundsh_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP0]], <8 x half> [[TMP1]], <2 x double> zeroinitializer, i8 -1, i32 8)
+// CHECK-NEXT:    ret <2 x double> [[TMP2]]
+//
 __m128d test_mm_cvt_roundsh_sd(__m128d A, __m128h B) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_cvt_roundsh_sd(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_mask_cvt_roundsh_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <2 x double> noundef [[C:%.*]], <8 x half> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <2 x double> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP0]], <8 x half> [[TMP1]], <2 x double> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <2 x double> [[TMP4]]
+//
 __m128d test_mm_mask_cvt_roundsh_sd(__m128d A, __mmask8 B, __m128d C, __m128h D) {
-  // CHECK-LABEL: test_mm_mask_cvt_roundsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_mask_cvt_roundsh_sd(A, B, C, D, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_maskz_cvt_roundsh_sd(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <2 x double> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP0]], <8 x half> [[TMP1]], <2 x double> [[TMP2]], i8 [[TMP3]], i32 8)
+// CHECK-NEXT:    ret <2 x double> [[TMP4]]
+//
 __m128d test_mm_maskz_cvt_roundsh_sd(__mmask8 A, __m128d B, __m128h C) {
-  // CHECK-LABEL: test_mm_maskz_cvt_roundsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_maskz_cvt_roundsh_sd(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_cvtsh_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP2]], <8 x half> [[TMP3]], <2 x double> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    ret <2 x double> [[TMP4]]
+//
 __m128d test_mm_cvtsh_sd(__m128d A, __m128h B) {
-  // CHECK-LABEL: test_mm_cvtsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_cvtsh_sd(A, B);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_mask_cvtsh_sd(
+// CHECK-SAME: <2 x double> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <2 x double> noundef [[C:%.*]], <8 x half> noundef [[D:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[D_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <2 x double> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[D]], ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[D_ADDR]], align 16
+// CHECK-NEXT:    store <2 x double> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP4]], <8 x half> [[TMP5]], <2 x double> [[TMP6]], i8 [[TMP7]], i32 4)
+// CHECK-NEXT:    ret <2 x double> [[TMP8]]
+//
 __m128d test_mm_mask_cvtsh_sd(__m128d A, __mmask8 B, __m128d C, __m128h D) {
-  // CHECK-LABEL: test_mm_mask_cvtsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_mask_cvtsh_sd(A, B, C, D);
 }
 
+// CHECK-LABEL: define dso_local <2 x double> @test_mm_maskz_cvtsh_sd(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <2 x double> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <2 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <2 x double> @llvm.x86.avx512fp16.mask.vcvtsh2sd.round(<2 x double> [[TMP3]], <8 x half> [[TMP4]], <2 x double> [[TMP5]], i8 [[TMP6]], i32 4)
+// CHECK-NEXT:    ret <2 x double> [[TMP7]]
+//
 __m128d test_mm_maskz_cvtsh_sd(__mmask8 A, __m128d B, __m128h C) {
-  // CHECK-LABEL: test_mm_maskz_cvtsh_sd
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtsh2sd.round
   return _mm_maskz_cvtsh_sd(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epi16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP1]], i32 -1, i32 11)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvt_roundph_epi16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_cvt_roundph_epi16(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epi16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvt_roundph_epi16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_mask_cvt_roundph_epi16(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epi16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epi16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_maskz_cvt_roundph_epi16(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epi16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP1]], <32 x i16> [[TMP3]], i32 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvtph_epi16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_cvtph_epi16(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epi16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <32 x i16>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP3]], <32 x i16> [[TMP5]], i32 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i16> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvtph_epi16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_mask_cvtph_epi16(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epi16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2w.512(<32 x half> [[TMP2]], <32 x i16> [[TMP4]], i32 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i16> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvtph_epi16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2w.512
   return _mm512_maskz_cvtph_epi16(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epi16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP1]], i32 -1, i32 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtt_roundph_epi16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_cvtt_roundph_epi16(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epi16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epi16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_mask_cvtt_roundph_epi16(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epi16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epi16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_maskz_cvtt_roundph_epi16(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epi16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP1]], <32 x i16> [[TMP3]], i32 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvttph_epi16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_cvttph_epi16(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epi16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <32 x i16>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP3]], <32 x i16> [[TMP5]], i32 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i16> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvttph_epi16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_mask_cvttph_epi16(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epi16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2w.512(<32 x half> [[TMP2]], <32 x i16> [[TMP4]], i32 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i16> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvttph_epi16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epi16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2w.512
   return _mm512_maskz_cvttph_epi16(A, B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cvt_roundepi16_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_cvt_roundepi16_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepi16_ph
-  // CHECK:   @llvm.x86.avx512.sitofp.round.v32f16.v32i16
   return _mm512_cvt_roundepi16_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cvt_roundepi16_ph(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_cvt_roundepi16_ph(__m512h A, __mmask32 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepi16_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v32f16.v32i16
   return _mm512_mask_cvt_roundepi16_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cvt_roundepi16_ph(
+// CHECK-SAME: i32 noundef [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512.sitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_cvt_roundepi16_ph(__mmask32 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepi16_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v32f16.v32i16
   return _mm512_maskz_cvt_roundepi16_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cvtepi16_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = sitofp <32 x i16> [[TMP2]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_cvtepi16_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepi16_ph
-  // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_cvtepi16_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cvtepi16_ph(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = sitofp <32 x i16> [[TMP4]] to <32 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP6]] to <32 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <32 x i1> [[TMP8]], <32 x half> [[TMP7]], <32 x half> [[TMP5]]
+// CHECK-NEXT:    ret <32 x half> [[TMP9]]
+//
 __m512h test_mm512_mask_cvtepi16_ph(__m512h A, __mmask32 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepi16_ph
-  // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_mask_cvtepi16_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cvtepi16_ph(
+// CHECK-SAME: i32 noundef [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = sitofp <32 x i16> [[TMP3]] to <32 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP5]] to <32 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <32 x i1> [[TMP7]], <32 x half> [[TMP6]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_cvtepi16_ph(__mmask32 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepi16_ph
-  // CHECK: %{{.*}} = sitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_maskz_cvtepi16_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epu16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP1]], i32 -1, i32 11)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvt_roundph_epu16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_cvt_roundph_epu16(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epu16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvt_roundph_epu16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_mask_cvt_roundph_epu16(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epu16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epu16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_maskz_cvt_roundph_epu16(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epu16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP1]], <32 x i16> [[TMP3]], i32 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvtph_epu16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_cvtph_epu16(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epu16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <32 x i16>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP3]], <32 x i16> [[TMP5]], i32 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i16> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvtph_epu16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_mask_cvtph_epu16(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epu16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvtph2uw.512(<32 x half> [[TMP2]], <32 x i16> [[TMP4]], i32 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i16> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvtph_epu16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uw.512
   return _mm512_maskz_cvtph_epu16(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epu16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP1]], i32 -1, i32 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x i16> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtt_roundph_epu16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_cvtt_roundph_epu16(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epu16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epu16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_mask_cvtt_roundph_epu16(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epu16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP0]], <32 x i16> [[TMP2]], i32 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epu16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_maskz_cvtt_roundph_epu16(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epu16(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP1]], <32 x i16> [[TMP3]], i32 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x i16> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvttph_epu16(__m512h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_cvttph_epu16(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epu16(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]], <32 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <32 x i16>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP3]], <32 x i16> [[TMP5]], i32 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x i16> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvttph_epu16(__m512i A, __mmask32 B, __m512h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_mask_cvttph_epu16(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epu16(
+// CHECK-SAME: i32 noundef [[A:%.*]], <32 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512fp16.mask.vcvttph2uw.512(<32 x half> [[TMP2]], <32 x i16> [[TMP4]], i32 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i16> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvttph_epu16(__mmask32 A, __m512h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epu16
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uw.512
   return _mm512_maskz_cvttph_epu16(A, B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cvt_roundepu16_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP2]]
+//
 __m512h test_mm512_cvt_roundepu16_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepu16_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16
   return _mm512_cvt_roundepu16_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cvt_roundepu16_ph(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_cvt_roundepu16_ph(__m512h A, __mmask32 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepu16_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16
   return _mm512_mask_cvt_roundepu16_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cvt_roundepu16_ph(
+// CHECK-SAME: i32 noundef [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512.uitofp.round.v32f16.v32i16(<32 x i16> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_cvt_roundepu16_ph(__mmask32 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepu16_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v32f16.v32i16
   return _mm512_maskz_cvt_roundepu16_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cvtepu16_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = uitofp <32 x i16> [[TMP2]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP4]]
+//
 __m512h test_mm512_cvtepu16_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepu16_ph
-  // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_cvtepu16_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cvtepu16_ph(
+// CHECK-SAME: <32 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <32 x half> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = uitofp <32 x i16> [[TMP4]] to <32 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP6]] to <32 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <32 x i1> [[TMP8]], <32 x half> [[TMP7]], <32 x half> [[TMP5]]
+// CHECK-NEXT:    ret <32 x half> [[TMP9]]
+//
 __m512h test_mm512_mask_cvtepu16_ph(__m512h A, __mmask32 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepu16_ph
-  // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_mask_cvtepu16_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cvtepu16_ph(
+// CHECK-SAME: i32 noundef [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = uitofp <32 x i16> [[TMP3]] to <32 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP5]] to <32 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <32 x i1> [[TMP7]], <32 x half> [[TMP6]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_cvtepu16_ph(__mmask32 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepu16_ph
-  // CHECK: %{{.*}} = uitofp <32 x i16> %{{.*}} to <32 x half>
   return _mm512_maskz_cvtepu16_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epi32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP1]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvt_roundph_epi32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_cvt_roundph_epi32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epi32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvt_roundph_epi32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_mask_cvt_roundph_epi32(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epi32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epi32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_maskz_cvt_roundph_epi32(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epi32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP1]], <16 x i32> [[TMP3]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvtph_epi32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_cvtph_epi32(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epi32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <16 x i32>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP3]], <16 x i32> [[TMP5]], i16 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i32> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvtph_epi32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_mask_cvtph_epi32(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epi32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2dq.512(<16 x half> [[TMP2]], <16 x i32> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvtph_epi32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2dq.512
   return _mm512_maskz_cvtph_epi32(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epu32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP1]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvt_roundph_epu32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_cvt_roundph_epu32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epu32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvt_roundph_epu32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_mask_cvt_roundph_epu32(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epu32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epu32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_maskz_cvt_roundph_epu32(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epu32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP1]], <16 x i32> [[TMP3]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvtph_epu32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_cvtph_epu32(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epu32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <16 x i32>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP3]], <16 x i32> [[TMP5]], i16 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i32> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvtph_epu32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_mask_cvtph_epu32(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epu32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvtph2udq.512(<16 x half> [[TMP2]], <16 x i32> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvtph_epu32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2udq.512
   return _mm512_maskz_cvtph_epu32(A, B);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvt_roundepi32_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm512_cvt_roundepi32_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepi32_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32
   return _mm512_cvt_roundepi32_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvt_roundepi32_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x half> [[TMP4]], <16 x half> [[TMP2]]
+// CHECK-NEXT:    ret <16 x half> [[TMP6]]
+//
 __m256h test_mm512_mask_cvt_roundepi32_ph(__m256h A, __mmask16 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepi32_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32
   return _mm512_mask_cvt_roundepi32_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvt_roundepi32_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x half> @llvm.x86.avx512.sitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x half> [[TMP4]], <16 x half> [[TMP2]]
+// CHECK-NEXT:    ret <16 x half> [[TMP6]]
+//
 __m256h test_mm512_maskz_cvt_roundepi32_ph(__mmask16 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepi32_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v16f16.v16i32
   return _mm512_maskz_cvt_roundepi32_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvtepi32_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = sitofp <16 x i32> [[TMP2]] to <16 x half>
+// CHECK-NEXT:    ret <16 x half> [[TMP4]]
+//
 __m256h test_mm512_cvtepi32_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepi32_ph
-  // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_cvtepi32_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvtepi32_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x half>, ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = sitofp <16 x i32> [[TMP4]] to <16 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x half> [[TMP7]], <16 x half> [[TMP5]]
+// CHECK-NEXT:    ret <16 x half> [[TMP9]]
+//
 __m256h test_mm512_mask_cvtepi32_ph(__m256h A, __mmask16 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepi32_ph
-  // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_mask_cvtepi32_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvtepi32_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = sitofp <16 x i32> [[TMP3]] to <16 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP5]] to <16 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x half> [[TMP6]], <16 x half> [[TMP4]]
+// CHECK-NEXT:    ret <16 x half> [[TMP8]]
+//
 __m256h test_mm512_maskz_cvtepi32_ph(__mmask16 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepi32_ph
-  // CHECK: %{{.*}} = sitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_maskz_cvtepi32_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvt_roundepu32_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <16 x half> [[TMP2]]
+//
 __m256h test_mm512_cvt_roundepu32_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepu32_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32
   return _mm512_cvt_roundepu32_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvt_roundepu32_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x half> [[TMP4]], <16 x half> [[TMP2]]
+// CHECK-NEXT:    ret <16 x half> [[TMP6]]
+//
 __m256h test_mm512_mask_cvt_roundepu32_ph(__m256h A, __mmask16 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepu32_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32
   return _mm512_mask_cvt_roundepu32_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvt_roundepu32_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x half> @llvm.x86.avx512.uitofp.round.v16f16.v16i32(<16 x i32> [[TMP1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x half> [[TMP4]], <16 x half> [[TMP2]]
+// CHECK-NEXT:    ret <16 x half> [[TMP6]]
+//
 __m256h test_mm512_maskz_cvt_roundepu32_ph(__mmask16 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepu32_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v16f16.v16i32
   return _mm512_maskz_cvt_roundepu32_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvtepu32_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = uitofp <16 x i32> [[TMP2]] to <16 x half>
+// CHECK-NEXT:    ret <16 x half> [[TMP4]]
+//
 __m256h test_mm512_cvtepu32_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepu32_ph
-  // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_cvtepu32_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvtepu32_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x half>, ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = uitofp <16 x i32> [[TMP4]] to <16 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x half> [[TMP7]], <16 x half> [[TMP5]]
+// CHECK-NEXT:    ret <16 x half> [[TMP9]]
+//
 __m256h test_mm512_mask_cvtepu32_ph(__m256h A, __mmask16 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepu32_ph
-  // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_mask_cvtepu32_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvtepu32_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = uitofp <16 x i32> [[TMP3]] to <16 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP5]] to <16 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x half> [[TMP6]], <16 x half> [[TMP4]]
+// CHECK-NEXT:    ret <16 x half> [[TMP8]]
+//
 __m256h test_mm512_maskz_cvtepu32_ph(__mmask16 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepu32_ph
-  // CHECK: %{{.*}} = uitofp <16 x i32> %{{.*}} to <16 x half>
   return _mm512_maskz_cvtepu32_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epi32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP1]], i16 -1, i32 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtt_roundph_epi32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_cvtt_roundph_epi32(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epi32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epi32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_mask_cvtt_roundph_epi32(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epi32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epi32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_maskz_cvtt_roundph_epi32(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epi32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP1]], <16 x i32> [[TMP3]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvttph_epi32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_cvttph_epi32(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epi32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <16 x i32>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP3]], <16 x i32> [[TMP5]], i16 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i32> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvttph_epi32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_mask_cvttph_epi32(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epi32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2dq.512(<16 x half> [[TMP2]], <16 x i32> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvttph_epi32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epi32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2dq.512
   return _mm512_maskz_cvttph_epi32(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epu32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i64> zeroinitializer to <16 x i32>
+// CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP1]], i16 -1, i32 8)
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtt_roundph_epu32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_cvtt_roundph_epu32(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epu32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epu32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_mask_cvtt_roundph_epu32(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epu32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP0]], <16 x i32> [[TMP2]], i16 [[TMP3]], i32 8)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epu32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_maskz_cvtt_roundph_epu32(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epu32(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
+// CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP1]], <16 x i32> [[TMP3]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[TMP4]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_cvttph_epu32(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_cvttph_epu32(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epu32(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <16 x i32>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP3]], <16 x i32> [[TMP5]], i16 [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i32> [[TMP7]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP8]]
+//
 __m512i test_mm512_mask_cvttph_epu32(__m512i A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_mask_cvttph_epu32(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epu32(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512fp16.mask.vcvttph2udq.512(<16 x half> [[TMP2]], <16 x i32> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP6]] to <8 x i64>
+// CHECK-NEXT:    ret <8 x i64> [[TMP7]]
+//
 __m512i test_mm512_maskz_cvttph_epu32(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epu32
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2udq.512
   return _mm512_maskz_cvttph_epu32(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvt_roundepi64_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
 __m128h test_mm512_cvt_roundepi64_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepi64_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64
   return _mm512_cvt_roundepi64_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvt_roundepi64_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x half> [[TMP3]], <8 x half> [[TMP1]]
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm512_mask_cvt_roundepi64_ph(__m128h A, __mmask8 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepi64_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64
   return _mm512_mask_cvt_roundepi64_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvt_roundepi64_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512.sitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x half> [[TMP3]], <8 x half> [[TMP1]]
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm512_maskz_cvt_roundepi64_ph(__mmask8 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepi64_ph
-  // CHECK: @llvm.x86.avx512.sitofp.round.v8f16.v8i64
   return _mm512_maskz_cvt_roundepi64_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvtepi64_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = sitofp <8 x i64> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm512_cvtepi64_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepi64_ph
-  // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_cvtepi64_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvtepi64_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = sitofp <8 x i64> [[TMP3]] to <8 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x half> [[TMP6]], <8 x half> [[TMP4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm512_mask_cvtepi64_ph(__m128h A, __mmask8 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepi64_ph
-  // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_mask_cvtepi64_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvtepi64_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = sitofp <8 x i64> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x half> [[TMP5]], <8 x half> [[TMP3]]
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm512_maskz_cvtepi64_ph(__mmask8 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepi64_ph
-  // CHECK: %{{.*}} = sitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_maskz_cvtepi64_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epi64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP0]], <8 x i64> zeroinitializer, i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+//
 __m512i test_mm512_cvt_roundph_epi64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_cvt_roundph_epi64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epi64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_mask_cvt_roundph_epi64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_mask_cvt_roundph_epi64(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epi64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epi64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_maskz_cvt_roundph_epi64(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epi64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP1]], <8 x i64> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtph_epi64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_cvtph_epi64(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epi64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP3]], <8 x i64> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP6]]
+//
 __m512i test_mm512_mask_cvtph_epi64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_mask_cvtph_epi64(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epi64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2qq.512(<8 x half> [[TMP2]], <8 x i64> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtph_epi64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2qq.512
   return _mm512_maskz_cvtph_epi64(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvt_roundepu64_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP1]]
+//
 __m128h test_mm512_cvt_roundepu64_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvt_roundepu64_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v8f16.v8i64
   return _mm512_cvt_roundepu64_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvt_roundepu64_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x half> [[TMP3]], <8 x half> [[TMP1]]
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm512_mask_cvt_roundepu64_ph(__m128h A, __mmask8 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundepu64_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v8f16.v8i64
   return _mm512_mask_cvt_roundepu64_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvt_roundepu64_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.x86.avx512.uitofp.round.v8f16.v8i64(<8 x i64> [[TMP0]], i32 11)
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+// CHECK-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x half> [[TMP3]], <8 x half> [[TMP1]]
+// CHECK-NEXT:    ret <8 x half> [[TMP5]]
+//
 __m128h test_mm512_maskz_cvt_roundepu64_ph(__mmask8 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundepu64_ph
-  // CHECK: @llvm.x86.avx512.uitofp.round.v8f16.v8i64
   return _mm512_maskz_cvt_roundepu64_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_cvtepu64_ph(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = uitofp <8 x i64> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 __m128h test_mm512_cvtepu64_ph(__m512i A) {
-  // CHECK-LABEL: test_mm512_cvtepu64_ph
-  // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_cvtepu64_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_mask_cvtepu64_ph(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x i64> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x i64> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = uitofp <8 x i64> [[TMP3]] to <8 x half>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8 [[TMP5]] to <8 x i1>
+// CHECK-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x half> [[TMP6]], <8 x half> [[TMP4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm512_mask_cvtepu64_ph(__m128h A, __mmask8 B, __m512i C) {
-  // CHECK-LABEL: test_mm512_mask_cvtepu64_ph
-  // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_mask_cvtepu64_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm512_maskz_cvtepu64_ph(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x i64> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = uitofp <8 x i64> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x half> [[TMP5]], <8 x half> [[TMP3]]
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm512_maskz_cvtepu64_ph(__mmask8 A, __m512i B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtepu64_ph
-  // CHECK: %{{.*}} = uitofp <8 x i64> %{{.*}} to <8 x half>
   return _mm512_maskz_cvtepu64_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvt_roundph_epu64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP0]], <8 x i64> zeroinitializer, i8 -1, i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+//
 __m512i test_mm512_cvt_roundph_epu64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_cvt_roundph_epu64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvt_roundph_epu64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_mask_cvt_roundph_epu64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_mask_cvt_roundph_epu64(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvt_roundph_epu64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_maskz_cvt_roundph_epu64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_maskz_cvt_roundph_epu64(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtph_epu64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP1]], <8 x i64> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvtph_epu64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvtph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_cvtph_epu64(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtph_epu64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP3]], <8 x i64> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP6]]
+//
 __m512i test_mm512_mask_cvtph_epu64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_mask_cvtph_epu64(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtph_epu64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvtph2uqq.512(<8 x half> [[TMP2]], <8 x i64> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvtph_epu64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2uqq.512
   return _mm512_maskz_cvtph_epu64(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epi64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP0]], <8 x i64> zeroinitializer, i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+//
 __m512i test_mm512_cvtt_roundph_epi64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_cvtt_roundph_epi64(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epi64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epi64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_mask_cvtt_roundph_epi64(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epi64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epi64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_maskz_cvtt_roundph_epi64(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epi64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP1]], <8 x i64> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvttph_epi64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_cvttph_epi64(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epi64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP3]], <8 x i64> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP6]]
+//
 __m512i test_mm512_mask_cvttph_epi64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_mask_cvttph_epi64(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epi64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2qq.512(<8 x half> [[TMP2]], <8 x i64> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvttph_epi64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epi64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2qq.512
   return _mm512_maskz_cvttph_epi64(A, B);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvtt_roundph_epu64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP0]], <8 x i64> zeroinitializer, i8 -1, i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP1]]
+//
 __m512i test_mm512_cvtt_roundph_epu64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvtt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_cvtt_roundph_epu64(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvtt_roundph_epu64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_mask_cvtt_roundph_epu64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_mask_cvtt_roundph_epu64(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvtt_roundph_epu64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP0]], <8 x i64> [[TMP1]], i8 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_maskz_cvtt_roundph_epu64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtt_roundph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_maskz_cvtt_roundph_epu64(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_cvttph_epu64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP1]], <8 x i64> [[TMP2]], i8 -1, i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP3]]
+//
 __m512i test_mm512_cvttph_epu64(__m128h A) {
-  // CHECK-LABEL: test_mm512_cvttph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_cvttph_epu64(A);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_mask_cvttph_epu64(
+// CHECK-SAME: <8 x i64> noundef [[A:%.*]], i8 noundef zeroext [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x i64> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i8 [[B]], ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[B_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP3]], <8 x i64> [[TMP4]], i8 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP6]]
+//
 __m512i test_mm512_mask_cvttph_epu64(__m512i A, __mmask8 B, __m128h C) {
-  // CHECK-LABEL: test_mm512_mask_cvttph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_mask_cvttph_epu64(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <8 x i64> @test_mm512_maskz_cvttph_epu64(
+// CHECK-SAME: i8 noundef zeroext [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[A]], ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512fp16.mask.vcvttph2uqq.512(<8 x half> [[TMP2]], <8 x i64> [[TMP3]], i8 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <8 x i64> [[TMP5]]
+//
 __m512i test_mm512_maskz_cvttph_epu64(__mmask8 A, __m128h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvttph_epu64
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvttph2uqq.512
   return _mm512_maskz_cvttph_epu64(A, B);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvt_roundsh_i32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx512fp16.vcvtsh2si32(<8 x half> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
 int test_mm_cvt_roundsh_i32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_i32
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2si32
   return _mm_cvt_roundsh_i32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvtsh_i32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcvtsh2si32(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_cvtsh_i32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtsh_i32
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2si32
   return _mm_cvtsh_i32(A);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvt_roundsh_u32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx512fp16.vcvtsh2usi32(<8 x half> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
 unsigned int test_mm_cvt_roundsh_u32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_u32
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi32
   return _mm_cvt_roundsh_u32(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvtsh_u32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcvtsh2usi32(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 unsigned int test_mm_cvtsh_u32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtsh_u32
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi32
   return _mm_cvtsh_u32(A);
 }
 
 #ifdef __x86_64__
+// CHECK-LABEL: define dso_local i64 @test_mm_cvt_roundsh_i64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.x86.avx512fp16.vcvtsh2si64(<8 x half> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret i64 [[TMP1]]
+//
 long long test_mm_cvt_roundsh_i64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_i64
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2si64
   return _mm_cvt_roundsh_i64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i64 @test_mm_cvtsh_i64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.x86.avx512fp16.vcvtsh2si64(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
 long long test_mm_cvtsh_i64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtsh_i64
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2si64
   return _mm_cvtsh_i64(A);
 }
 
+// CHECK-LABEL: define dso_local i64 @test_mm_cvt_roundsh_u64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.x86.avx512fp16.vcvtsh2usi64(<8 x half> [[TMP0]], i32 11)
+// CHECK-NEXT:    ret i64 [[TMP1]]
+//
 unsigned long long test_mm_cvt_roundsh_u64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvt_roundsh_u64
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi64
   return _mm_cvt_roundsh_u64(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i64 @test_mm_cvtsh_u64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.x86.avx512fp16.vcvtsh2usi64(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
 unsigned long long test_mm_cvtsh_u64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtsh_u64
-  // CHECK: @llvm.x86.avx512fp16.vcvtsh2usi64
   return _mm_cvtsh_u64(A);
 }
 #endif
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundu32_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.vcvtusi2sh(<8 x half> [[TMP0]], i32 [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundu32_sh(__m128h A, unsigned int B) {
-  // CHECK-LABEL: test_mm_cvt_roundu32_sh
-  // CHECK: @llvm.x86.avx512fp16.vcvtusi2sh
   return _mm_cvt_roundu32_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvtu32_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__B_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__B_ADDR_I]], align 4
+// CHECK-NEXT:    [[CONV_I:%.*]] = uitofp i32 [[TMP2]] to half
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP3]], half [[CONV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvtu32_sh(__m128h A, unsigned int B) {
-  // CHECK-LABEL: test_mm_cvtu32_sh
-  // CHECK: %{{.*}} = uitofp i32 %{{.*}} to half
   return _mm_cvtu32_sh(A, B);
 }
 
 #ifdef __x86_64__
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundu64_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.vcvtusi642sh(<8 x half> [[TMP0]], i64 [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundu64_sh(__m128h A, unsigned long long B) {
-  // CHECK-LABEL: test_mm_cvt_roundu64_sh
-  // CHECK: @llvm.x86.avx512fp16.vcvtusi642sh
   return _mm_cvt_roundu64_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvtu64_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i64 [[TMP1]], ptr [[__B_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
+// CHECK-NEXT:    [[CONV_I:%.*]] = uitofp i64 [[TMP2]] to half
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP3]], half [[CONV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvtu64_sh(__m128h A, unsigned long long B) {
-  // CHECK-LABEL: test_mm_cvtu64_sh
-  // CHECK: %{{.*}} = uitofp i64 %{{.*}} to half
   return _mm_cvtu64_sh(A, B);
 }
 #endif
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundi32_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.vcvtsi2sh(<8 x half> [[TMP0]], i32 [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundi32_sh(__m128h A, int B) {
-  // CHECK-LABEL: test_mm_cvt_roundi32_sh
-  // CHECK: @llvm.x86.avx512fp16.vcvtsi2sh
   return _mm_cvt_roundi32_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvti32_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__B_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__B_ADDR_I]], align 4
+// CHECK-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[TMP2]] to half
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP3]], half [[CONV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvti32_sh(__m128h A, int B) {
-  // CHECK-LABEL: test_mm_cvti32_sh
-  // CHECK: %{{.*}} = sitofp i32 %{{.*}} to half
   return _mm_cvti32_sh(A, B);
 }
 
 #ifdef __x86_64__
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvt_roundi64_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x half> @llvm.x86.avx512fp16.vcvtsi642sh(<8 x half> [[TMP0]], i64 [[TMP1]], i32 11)
+// CHECK-NEXT:    ret <8 x half> [[TMP2]]
+//
 __m128h test_mm_cvt_roundi64_sh(__m128h A, long long B) {
-  // CHECK-LABEL: test_mm_cvt_roundi64_sh
-  // CHECK: @llvm.x86.avx512fp16.vcvtsi642sh
   return _mm_cvt_roundi64_sh(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cvti64_sh(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR]], align 8
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i64 [[TMP1]], ptr [[__B_ADDR_I]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__B_ADDR_I]], align 8
+// CHECK-NEXT:    [[CONV_I:%.*]] = sitofp i64 [[TMP2]] to half
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[VECINS_I:%.*]] = insertelement <8 x half> [[TMP3]], half [[CONV_I]], i32 0
+// CHECK-NEXT:    store <8 x half> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 __m128h test_mm_cvti64_sh(__m128h A, long long B) {
-  // CHECK-LABEL: test_mm_cvti64_sh
-  // CHECK: %{{.*}} = sitofp i64 %{{.*}} to half
   return _mm_cvti64_sh(A, B);
 }
 #endif
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvtt_roundsh_i32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx512fp16.vcvttsh2si32(<8 x half> [[TMP0]], i32 8)
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
 int test_mm_cvtt_roundsh_i32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtt_roundsh_i32
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2si32
   return _mm_cvtt_roundsh_i32(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvttsh_i32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcvttsh2si32(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_cvttsh_i32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvttsh_i32
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2si32
   return _mm_cvttsh_i32(A);
 }
 
 #ifdef __x86_64__
+// CHECK-LABEL: define dso_local i64 @test_mm_cvtt_roundsh_i64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.x86.avx512fp16.vcvttsh2si64(<8 x half> [[TMP0]], i32 8)
+// CHECK-NEXT:    ret i64 [[TMP1]]
+//
 long long test_mm_cvtt_roundsh_i64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtt_roundsh_i64
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2si64
   return _mm_cvtt_roundsh_i64(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i64 @test_mm_cvttsh_i64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.x86.avx512fp16.vcvttsh2si64(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
 long long test_mm_cvttsh_i64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvttsh_i64
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2si64
   return _mm_cvttsh_i64(A);
 }
 #endif
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvtt_roundsh_u32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.x86.avx512fp16.vcvttsh2usi32(<8 x half> [[TMP0]], i32 8)
+// CHECK-NEXT:    ret i32 [[TMP1]]
+//
 unsigned int test_mm_cvtt_roundsh_u32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtt_roundsh_u32
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi32
   return _mm_cvtt_roundsh_u32(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i32 @test_mm_cvttsh_u32(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.avx512fp16.vcvttsh2usi32(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 unsigned int test_mm_cvttsh_u32(__m128h A) {
-  // CHECK-LABEL: test_mm_cvttsh_u32
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi32
   return _mm_cvttsh_u32(A);
 }
 
 #ifdef __x86_64__
+// CHECK-LABEL: define dso_local i64 @test_mm_cvtt_roundsh_u64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.x86.avx512fp16.vcvttsh2usi64(<8 x half> [[TMP0]], i32 8)
+// CHECK-NEXT:    ret i64 [[TMP1]]
+//
 unsigned long long test_mm_cvtt_roundsh_u64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvtt_roundsh_u64
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi64
   return _mm_cvtt_roundsh_u64(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local i64 @test_mm_cvttsh_u64(
+// CHECK-SAME: <8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.x86.avx512fp16.vcvttsh2usi64(<8 x half> [[TMP1]], i32 4)
+// CHECK-NEXT:    ret i64 [[TMP2]]
+//
 unsigned long long test_mm_cvttsh_u64(__m128h A) {
-  // CHECK-LABEL: test_mm_cvttsh_u64
-  // CHECK: @llvm.x86.avx512fp16.vcvttsh2usi64
   return _mm_cvttsh_u64(A);
 }
 #endif
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_cvtx_roundph_ps(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP0]], <16 x float> zeroinitializer, i16 -1, i32 8)
+// CHECK-NEXT:    ret <16 x float> [[TMP1]]
+//
 __m512 test_mm512_cvtx_roundph_ps(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtx_roundph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_cvtx_roundph_ps(A, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_mask_cvtx_roundph_ps(
+// CHECK-SAME: <16 x float> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x float> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP0]], <16 x float> [[TMP1]], i16 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <16 x float> [[TMP3]]
+//
 __m512 test_mm512_mask_cvtx_roundph_ps(__m512 A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtx_roundph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_mask_cvtx_roundph_ps(A, B, C, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_maskz_cvtx_roundph_ps(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store <16 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP0]], <16 x float> [[TMP1]], i16 [[TMP2]], i32 8)
+// CHECK-NEXT:    ret <16 x float> [[TMP3]]
+//
 __m512 test_mm512_maskz_cvtx_roundph_ps(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtx_roundph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_maskz_cvtx_roundph_ps(A, B, _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_cvtxph_ps(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <16 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP1]], <16 x float> [[TMP2]], i16 -1, i32 4)
+// CHECK-NEXT:    ret <16 x float> [[TMP3]]
+//
 __m512 test_mm512_cvtxph_ps(__m256h A) {
-  // CHECK-LABEL: test_mm512_cvtxph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_cvtxph_ps(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_mask_cvtxph_ps(
+// CHECK-SAME: <16 x float> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store <16 x float> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[C_ADDR]], align 32
+// CHECK-NEXT:    store <16 x float> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x float>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP3]], <16 x float> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <16 x float> [[TMP6]]
+//
 __m512 test_mm512_mask_cvtxph_ps(__m512 A, __mmask16 B, __m256h C) {
-  // CHECK-LABEL: test_mm512_mask_cvtxph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_mask_cvtxph_ps(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <16 x float> @test_mm512_maskz_cvtxph_ps(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[__A_ADDR_I]], align 32
+// CHECK-NEXT:    store <16 x float> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x float>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vcvtph2psx.512(<16 x half> [[TMP2]], <16 x float> [[TMP3]], i16 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <16 x float> [[TMP5]]
+//
 __m512 test_mm512_maskz_cvtxph_ps(__mmask16 A, __m256h B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtxph_ps
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtph2psx.512
   return _mm512_maskz_cvtxph_ps(A, B);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvtx_roundps_ph(
+// CHECK-SAME: <16 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store <16 x float> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP0]], <16 x half> zeroinitializer, i16 -1, i32 11)
+// CHECK-NEXT:    ret <16 x half> [[TMP1]]
+//
 __m256h test_mm512_cvtx_roundps_ph(__m512 A) {
-  // CHECK-LABEL: test_mm512_cvtx_roundps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_cvtx_roundps_ph(A, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvtx_roundps_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x float> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP0]], <16 x half> [[TMP1]], i16 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <16 x half> [[TMP3]]
+//
 __m256h test_mm512_mask_cvtx_roundps_ph(__m256h A, __mmask16 B, __m512 C) {
-  // CHECK-LABEL: test_mm512_mask_cvtx_roundps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_mask_cvtx_roundps_ph(A, B, C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvtx_roundps_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x float> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP0]], <16 x half> [[TMP1]], i16 [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <16 x half> [[TMP3]]
+//
 __m256h test_mm512_maskz_cvtx_roundps_ph(__mmask16 A, __m512 B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtx_roundps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_maskz_cvtx_roundps_ph(A, B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_cvtxps_ph(
+// CHECK-SAME: <16 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store <16 x float> [[A]], ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 64
+// CHECK-NEXT:    store <16 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP1]], <16 x half> [[TMP2]], i16 -1, i32 4)
+// CHECK-NEXT:    ret <16 x half> [[TMP3]]
+//
 __m256h test_mm512_cvtxps_ph(__m512 A) {
-  // CHECK-LABEL: test_mm512_cvtxps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_cvtxps_ph(A);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_mask_cvtxps_ph(
+// CHECK-SAME: <16 x half> noundef [[A:%.*]], i16 noundef zeroext [[B:%.*]], <16 x float> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[C_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    store <16 x float> [[C]], ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr [[A_ADDR]], align 32
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr [[C_ADDR]], align 64
+// CHECK-NEXT:    store <16 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x float> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x float>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x half>, ptr [[__W_ADDR_I]], align 32
+// CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP3]], <16 x half> [[TMP4]], i16 [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <16 x half> [[TMP6]]
+//
 __m256h test_mm512_mask_cvtxps_ph(__m256h A, __mmask16 B, __m512 C) {
-  // CHECK-LABEL: test_mm512_mask_cvtxps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_mask_cvtxps_ph(A, B, C);
 }
 
+// CHECK-LABEL: define dso_local <16 x half> @test_mm512_maskz_cvtxps_ph(
+// CHECK-SAME: i16 noundef zeroext [[A:%.*]], <16 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <16 x half>, align 32
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[B_ADDR:%.*]] = alloca <16 x float>, align 64
+// CHECK-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    store <16 x float> [[B]], ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <16 x float> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x float>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <16 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 32
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x half> @llvm.x86.avx512fp16.mask.vcvtps2phx.512(<16 x float> [[TMP2]], <16 x half> [[TMP3]], i16 [[TMP4]], i32 4)
+// CHECK-NEXT:    ret <16 x half> [[TMP5]]
+//
 __m256h test_mm512_maskz_cvtxps_ph(__mmask16 A, __m512 B) {
-  // CHECK-LABEL: test_mm512_maskz_cvtxps_ph
-  // CHECK: @llvm.x86.avx512fp16.mask.vcvtps2phx.512
   return _mm512_maskz_cvtxps_ph(A, B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
   return _mm512_fmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask3_fmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmadd_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
   return _mm512_fmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmsub_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fnmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[FNEG]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
   return _mm512_fnmadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fnmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP0]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[FNEG]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask3_fnmadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fnmadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fnmadd_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP0]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[FNEG]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fnmadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fnmadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fnmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[FNEG]], <32 x half> [[FNEG1]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_round_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
   return _mm512_fnmsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fnmsub_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP0]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[FNEG]], <32 x half> [[TMP1]], <32 x half> [[FNEG1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fnmsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_round_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fnmsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP3]], <32 x half> [[TMP4]], <32 x half> [[TMP5]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_ph
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
   return _mm512_fmadd_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_ph
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
   return _mm512_mask_fmadd_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_ph
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmadd_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmadd_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_ph
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmadd_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP3]], <32 x half> [[TMP4]], <32 x half> [[FNEG_I]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmsub_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
   return _mm512_fmsub_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[FNEG_I]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsub_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmsub_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmsub_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[FNEG_I]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsub_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmsub_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fnmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP3]], <32 x half> [[FNEG_I]], <32 x half> [[TMP5]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fnmadd_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
   return _mm512_fnmadd_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fnmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[FNEG_I]], <32 x half> [[TMP5]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmadd_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fnmadd_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fnmadd_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[FNEG_I]], <32 x half> [[TMP5]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmadd_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fnmadd_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fnmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <32 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP3]], <32 x half> [[FNEG_I]], <32 x half> [[FNEG1_I]])
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fnmsub_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
   return _mm512_fnmsub_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fnmsub_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[FNEG_I]], <32 x half> [[TMP5]], <32 x half> [[FNEG1_I]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fnmsub_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fnmsub_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmaddsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
   return _mm512_fmaddsub_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmaddsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fmaddsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmaddsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmaddsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask3_fmaddsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmaddsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmaddsub_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fmaddsub_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_round_ph
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmaddsub_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmsubadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    ret <32 x half> [[TMP3]]
+//
 __m512h test_mm512_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
   return _mm512_fmsubadd_round_ph(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmsubadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fmsubadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmsubadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmsubadd_round_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[FNEG]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_maskz_fmsubadd_round_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmsubadd_round_ph(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmaddsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], <32 x half> [[TMP5]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmaddsub_ph
-  // CHECK-NOT: fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
   return _mm512_fmaddsub_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmaddsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmaddsub_ph
-  // CHECK-NOT: fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmaddsub_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmaddsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmaddsub_ph
-  // CHECK-NOT: fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmaddsub_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmaddsub_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP6]], i32 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmaddsub_ph
-  // CHECK-NOT: fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}, i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmaddsub_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmsubadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP3]], <32 x half> [[TMP4]], <32 x half> [[FNEG_I]], i32 4)
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmsubadd_ph
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
   return _mm512_fmsubadd_ph(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmsubadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[FNEG_I]], i32 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmsubadd_ph
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fmsubadd_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmsubadd_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[FNEG_I]], i32 4)
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> zeroinitializer
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmsubadd_ph
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> zeroinitializer
   return _mm512_maskz_fmsubadd_ph(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP4]], i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x half> [[TMP5]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_mask3_fmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[TMP9]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsub_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmsub_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmsubadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP0]], <32 x half> [[TMP1]], <32 x half> [[TMP4]], i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x half> [[TMP5]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_mask3_fmsubadd_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmaddsub.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmsubadd_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmsubadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> [[TMP4]], <32 x half> [[TMP5]], <32 x half> [[TMP8]], i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[TMP9]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmsubadd_ph
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: call <32 x half> @llvm.x86.avx512fp16.vfmaddsub.ph.512(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> [[NEG]], i32 4)
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fmsubadd_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fnmadd_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[FNEG]], <32 x half> [[TMP2]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fnmadd_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_round_ph
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fnmadd_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fnmadd_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[FNEG_I]], <32 x half> [[TMP6]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmadd_ph
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fnmadd_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fnmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[TMP0]], <32 x half> [[FNEG]], <32 x half> [[FNEG1]], i32 11)
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP6:%.*]] = select <32 x i1> [[TMP5]], <32 x half> [[TMP4]], <32 x half> [[TMP0]]
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mask_fnmsub_round_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_round_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fnmsub_round_ph(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fnmsub_round_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <32 x half> [[TMP0]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = fneg <32 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP5:%.*]] = call <32 x half> @llvm.x86.avx512fp16.vfmadd.ph.512(<32 x half> [[FNEG]], <32 x half> [[TMP1]], <32 x half> [[TMP4]], i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x half> [[TMP5]], <32 x half> [[TMP2]]
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_mask3_fnmsub_round_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_round_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: @llvm.x86.avx512fp16.vfmadd.ph.512
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fnmsub_round_ph(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fnmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i32 noundef [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[TMP4]], <32 x half> [[FNEG_I]], <32 x half> [[FNEG1_I]])
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x half> [[TMP8]], <32 x half> [[TMP4]]
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fnmsub_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_fnmsub_ph(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fnmsub_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i32 noundef [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <32 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = fneg <32 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x half> @llvm.fma.v32f16(<32 x half> [[FNEG_I]], <32 x half> [[TMP5]], <32 x half> [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32 [[TMP7]] to <32 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = select <32 x i1> [[TMP10]], <32 x half> [[TMP9]], <32 x half> [[TMP6]]
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fnmsub_ph
-  // CHECK: fneg
-  // CHECK: fneg
-  // CHECK: call <32 x half> @llvm.fma.v32f16(<32 x half> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}})
-  // CHECK: bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK: select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask3_fnmsub_ph(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call half @llvm.fma.f16(half [[TMP6]], half [[TMP7]], half [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP9]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fmadd_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
   return _mm_fmadd_sh(__W, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP8]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmadd_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fmadd_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP3]], half [[TMP4]], half [[TMP5]], i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP6]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fmadd_round_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
   return _mm_fmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP4]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmadd_round_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmadd_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half 0xH0000
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmadd_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fmadd_sh(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmadd_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half 0xH0000
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmadd_round_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP10]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP6]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmadd_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fmadd_sh(__W, __X, __Y, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP6]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask3_fmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sh
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call half @llvm.fma.f16(half [[TMP6]], half [[TMP7]], half [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP9]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fmsub_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_fmsub_sh(__W, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP8]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmsub_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_mask_fmsub_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP3]], half [[TMP4]], half [[TMP5]], i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP6]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fmsub_round_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_fmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP4]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmsub_round_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half %{{.*}}
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_mask_fmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmsub_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half 0xH0000
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmsub_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.fma.f16(half %{{.*}}, half %{{.*}}, half %{{.*}})
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_maskz_fmsub_sh(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmsub_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half 0xH0000
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmsub_round_sh
-  // CHECK: %{{.*}} = fneg <8 x half> %{{.*}}
-  // CHECK: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = call half @llvm.x86.avx512fp16.vfmadd.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}, i32 11)
-  // CHECK-NEXT: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: %{{.*}} = extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: %{{.*}} = select i1 %{{.*}}, half %{{.*}}, half 0xH0000
-  // CHECK-NEXT: %{{.*}} = insertelement <8 x half> %{{.*}}, half %{{.*}}, i64 0
-  // CHECK-NEXT: ret <8 x half> %{{.*}}
   return _mm_maskz_fmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x half> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = call half @llvm.fma.f16(half [[TMP9]], half [[TMP10]], half [[TMP11]])
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP6]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmsub_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fmsub_sh(__W, __X, __Y, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP1]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP5]], half [[TMP6]], half [[TMP7]], i32 11)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], half [[TMP8]], half [[TMP9]]
+// CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP12]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP13]]
+//
 __m128h test_mm_mask3_fmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmsub_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fnmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP5]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call half @llvm.fma.f16(half [[TMP6]], half [[TMP7]], half [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP9]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fnmadd_sh(__m128h __W, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fnmadd_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
   return _mm_fnmadd_sh(__W, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fnmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP8]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fnmadd_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fnmadd_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fnmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP3]], half [[TMP4]], half [[TMP5]], i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP6]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fnmadd_round_sh(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fnmadd_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
   return _mm_fnmadd_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fnmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP4]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fnmadd_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fnmadd_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fnmadd_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fnmadd_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half 0xH0000
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fnmadd_sh(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fnmadd_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half 0xH0000
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fnmadd_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fnmadd_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fnmadd_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP10]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP6]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fnmadd_sh(__W, __X, __Y, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fnmadd_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP6]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask3_fnmadd_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fnmadd_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fnmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[TMP3]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[FNEG1_I]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = call half @llvm.fma.f16(half [[TMP6]], half [[TMP7]], half [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = insertelement <8 x half> [[TMP3]], half [[TMP9]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fnmsub_sh(__m128h __W, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fnmsub_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
   return _mm_fnmsub_sh(__W, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fnmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[FNEG1_I]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half [[TMP8]]
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fnmsub_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fnmsub_sh(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fnmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG1]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP3]], half [[TMP4]], half [[TMP5]], i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP6]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fnmsub_round_sh(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fnmsub_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[FMA]], i64 0
   return _mm_fnmsub_round_sh(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fnmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[FNEG1]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half [[TMP4]]
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fnmsub_round_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fnmsub_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[A]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_mask_fnmsub_round_sh(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fnmsub_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG1_I:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[FNEG1_I]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = call half @llvm.fma.f16(half [[TMP8]], half [[TMP9]], half [[TMP10]])
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], half [[TMP11]], half 0xH0000
+// CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x half> [[TMP4]], half [[TMP14]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fnmsub_sh(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fnmsub_round_sh(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG1:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[FNEG1]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP4]], half [[TMP5]], half [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x i1> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = select i1 [[TMP9]], half [[TMP7]], half 0xH0000
+// CHECK-NEXT:    [[TMP11:%.*]] = insertelement <8 x half> [[TMP0]], half [[TMP10]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fnmsub_round_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> [[ORIGA:%.+]], i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half 0xH0000
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGA]], half [[SEL]], i64 0
   return _mm_maskz_fnmsub_round_sh(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fnmsub_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__X_ADDR_I]], align 16
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP5]]
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__Y_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP8:%.*]] = fneg <8 x half> [[TMP6]]
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = extractelement <8 x half> [[FNEG_I]], i64 0
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x half> [[TMP8]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = call half @llvm.fma.f16(half [[TMP9]], half [[TMP10]], half [[TMP11]])
+// CHECK-NEXT:    [[TMP13:%.*]] = extractelement <8 x half> [[TMP6]], i64 0
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 [[TMP7]] to <8 x i1>
+// CHECK-NEXT:    [[TMP15:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
+// CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], half [[TMP12]], half [[TMP13]]
+// CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x half> [[TMP6]], half [[TMP16]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP17]]
+//
 __m128h test_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.fma.f16(half [[A]], half [[B]], half [[C]])
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fnmsub_sh(__W, __X, __Y, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fnmsub_round_sh(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], <8 x half> noundef [[__X:%.*]], <8 x half> noundef [[__Y:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__X_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__Y_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__X]], ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__Y]], ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__X_ADDR]], align 16
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[TMP1]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__Y_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = fneg <8 x half> [[TMP2]]
+// CHECK-NEXT:    [[TMP5:%.*]] = extractelement <8 x half> [[TMP0]], i64 0
+// CHECK-NEXT:    [[TMP6:%.*]] = extractelement <8 x half> [[FNEG]], i64 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x half> [[TMP4]], i64 0
+// CHECK-NEXT:    [[TMP8:%.*]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[TMP5]], half [[TMP6]], half [[TMP7]], i32 11)
+// CHECK-NEXT:    [[TMP9:%.*]] = extractelement <8 x half> [[TMP2]], i64 0
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+// CHECK-NEXT:    [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+// CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], half [[TMP8]], half [[TMP9]]
+// CHECK-NEXT:    [[TMP13:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP12]], i64 0
+// CHECK-NEXT:    ret <8 x half> [[TMP13]]
+//
 __m128h test_mm_mask3_fnmsub_round_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sh
-  // CHECK: [[NEG:%.+]] = fneg
-  // CHECK: [[NEG2:%.+]] = fneg
-  // CHECK: [[A:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[B:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[C:%.+]] = extractelement <8 x half> %{{.*}}, i64 0
-  // CHECK-NEXT: [[FMA:%.+]] = call half @llvm.x86.avx512fp16.vfmadd.f16(half [[A]], half [[B]], half [[C]], i32 11)
-  // CHECK-NEXT: [[C2:%.+]] = extractelement <8 x half> [[ORIGC:%.+]], i64 0
-  // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1>
-  // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0
-  // CHECK-NEXT: [[SEL:%.+]] = select i1 %{{.*}}, half [[FMA]], half [[C2]]
-  // CHECK-NEXT: insertelement <8 x half> [[ORIGC]], half [[SEL]], i64 0
   return _mm_mask3_fnmsub_round_sh(__W, __X, __Y, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fcmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fcmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
   return _mm_fcmadd_sch(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fcmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = and i8 [[TMP10]], 1
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[EXTRACT_I]], <4 x float> [[TMP11]], <4 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x float> [[TMP14]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_mask_fcmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
-  // CHECK:  %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:  %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fcmadd_sch(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fcmadd_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fcmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.csh
   return _mm_maskz_fcmadd_sch(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fcmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x float> [[TMP12]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP13]]
+//
 __m128h test_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fcmadd_sch
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
-  // CHECK:  %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  // CHECK:  %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fcmadd_sch(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fcmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fcmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fcmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
   return _mm_fcmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fcmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = and i8 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP7]], <4 x float> [[TMP1]]
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fcmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_mask_fcmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.csh
-  // CHECK:  %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:  %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fcmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fcmadd_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_fcmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fcmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.csh
   return _mm_maskz_fcmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fcmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP9]]
+//
 __m128h test_mm_mask3_fcmadd_round_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fcmadd_round_sch
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
-  // CHECK:  %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  // CHECK:  %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fcmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP10]]
+//
 __m128h test_mm_fmadd_sch(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
   return _mm_fmadd_sch(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = and i8 [[TMP10]], 1
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[TMP12]] to <8 x i1>
+// CHECK-NEXT:    [[EXTRACT_I:%.*]] = shufflevector <8 x i1> [[TMP13]], <8 x i1> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TMP14:%.*]] = select <4 x i1> [[EXTRACT_I]], <4 x float> [[TMP11]], <4 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x float> [[TMP14]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP15]]
+//
 __m128h test_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_mask_fmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
-  // CHECK:  %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:  %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmadd_sch(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmadd_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmadd_sch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.csh
   return _mm_maskz_fmadd_sch(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmadd_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__C_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <4 x float> [[TMP12]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP13]]
+//
 __m128h test_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmadd_sch
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 4)
-  // CHECK:  %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  // CHECK:  %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fmadd_sch(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_fmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
   return _mm_fmadd_round_sch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = and i8 [[TMP6]], 1
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8 [[TMP8]] to <8 x i1>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP9]], <8 x i1> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-NEXT:    [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP7]], <4 x float> [[TMP1]]
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_mask_fmadd_round_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_mask_fmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.csh
-  // CHECK:  %{{.*}} = shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK:  %{{.*}} = select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_fmadd_round_sch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmadd_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_fmadd_round_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
-  // CHECK-LABEL: @test_mm_maskz_fmadd_round_sch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.csh
   return _mm_maskz_fmadd_round_sch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask3_fmadd_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]], <8 x half> noundef [[__C:%.*]], i8 noundef zeroext [[__U:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__C]], ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__C_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP9]]
+//
 __m128h test_mm_mask3_fmadd_round_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
-  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sch
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = bitcast <8 x half> %{{.*}} to <4 x float>
-  // CHECK:  %{{.*}} = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 %{{.*}}, i32 11)
-  // CHECK:  %{{.*}} = shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
-  // CHECK:  %{{.*}} = bitcast <4 x float> %{{.*}} to <8 x half>
   return _mm_mask3_fmadd_round_sch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fcmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x float> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fcmul_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fcmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_fcmul_sch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fcmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fcmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_mask_fcmul_sch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fcmul_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_fcmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_maskz_fcmul_sch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fcmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> zeroinitializer to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm_fcmul_round_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fcmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_fcmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fcmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_fcmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fcmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_mask_fcmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fcmul_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_fcmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_fcmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_maskz_fcmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fcmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP3]], <16 x float> [[TMP5]], <16 x float> zeroinitializer, i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_fcmul_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_fcmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_fcmul_pch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fcmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_fcmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_mask_fcmul_pch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fcmul_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x float> [[TMP10]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_fcmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_maskz_fcmul_pch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fcmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> zeroinitializer to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP4]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP5]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fcmul_round_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_fcmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_fcmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fcmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_fcmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_fcmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_mask_fcmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fcmul_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_fcmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_fcmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_maskz_fcmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fcmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x float> [[TMP9]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fcmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
   return _mm512_fcmadd_pch(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fcmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP10]] to <16 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP11]], <16 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP13]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP14]]
+//
 __m512h test_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fcmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fcmadd_pch(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fcmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fcmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
-  // CHECK-NOT:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fcmadd_pch(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fcmadd_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fcmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512
   return _mm512_maskz_fcmadd_pch(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fcmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fcmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
   return _mm512_fcmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fcmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP7]], <16 x float> [[TMP1]]
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x float> [[TMP9]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fcmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fcmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fcmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fcmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask3_fcmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fcmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmadd.cph.512
-  // CHECK-NOT:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fcmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fcmadd_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_fcmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fcmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512
   return _mm512_maskz_fcmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP3]], <16 x float> [[TMP5]], <16 x float> zeroinitializer, i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_fmul_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_fmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_fmul_pch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_fmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mask_fmul_pch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmul_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x float> [[TMP10]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_fmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_maskz_fmul_pch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> zeroinitializer to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP4]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP5]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_fmul_round_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_fmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_fmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_fmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_fmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mask_fmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmul_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_fmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_fmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_maskz_fmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x float> [[TMP9]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
   return _mm512_fmadd_pch(__A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP10]] to <16 x i1>
+// CHECK-NEXT:    [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP11]], <16 x float> [[TMP5]]
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x float> [[TMP13]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP14]]
+//
 __m512h test_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmadd_pch(__A, __U, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmadd_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
-  // CHECK-NOT:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmadd_pch(__A, __B, __C, __U);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmadd_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__C_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_pch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.512
   return _mm512_maskz_fmadd_pch(__U, __A, __B, __C);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_fmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_fmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
   return _mm512_fmadd_round_pch(__A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_fmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
+// CHECK-NEXT:    [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP7]], <16 x float> [[TMP1]]
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x float> [[TMP9]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_mask_fmadd_round_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_mask_fmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
-  // CHECK:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_fmadd_round_pch(__A, __U, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask3_fmadd_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]], i16 noundef zeroext [[__U:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask3_fmadd_round_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
-  // CHECK-LABEL: @test_mm512_mask3_fmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmadd.cph.512
-  // CHECK-NOT:  %{{.*}} = select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask3_fmadd_round_pch(__A, __B, __C, __U, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_fmadd_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]], <32 x half> noundef [[__C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__C_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__C]], ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__C_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_fmadd_round_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
-  // CHECK-LABEL: @test_mm512_maskz_fmadd_round_pch
-  // CHECK: @llvm.x86.avx512fp16.maskz.vfmadd.cph.512
   return _mm512_maskz_fmadd_round_pch(__U, __A, __B, __C, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x float> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_fmul_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_fmul_sch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_mask_fmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mask_fmul_sch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmul_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_fmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_maskz_fmul_sch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_fmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> zeroinitializer to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm_fmul_round_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_fmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_fmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_fmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_fmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_fmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mask_fmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_fmul_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_fmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_fmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_maskz_fmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local half @test_mm512_reduce_add_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
+//
 _Float16 test_mm512_reduce_add_ph(__m512h __W) {
-  // CHECK-LABEL: @test_mm512_reduce_add_ph
-  // CHECK: call reassoc half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}})
   return _mm512_reduce_add_ph(__W);
 }
 
+// CHECK-LABEL: define dso_local half @test_mm512_reduce_mul_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call reassoc half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
+//
 _Float16 test_mm512_reduce_mul_ph(__m512h __W) {
-  // CHECK-LABEL: @test_mm512_reduce_mul_ph
-  // CHECK: call reassoc half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> %{{.*}})
   return _mm512_reduce_mul_ph(__W);
 }
 
+// CHECK-LABEL: define dso_local half @test_mm512_reduce_max_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__V_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__V_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call nnan half @llvm.vector.reduce.fmax.v32f16(<32 x half> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
+//
 _Float16 test_mm512_reduce_max_ph(__m512h __W) {
-  // CHECK-LABEL: @test_mm512_reduce_max_ph
-  // CHECK: call nnan half @llvm.vector.reduce.fmax.v32f16(<32 x half> %{{.*}})
   return _mm512_reduce_max_ph(__W);
 }
 
+// CHECK-LABEL: define dso_local half @test_mm512_reduce_min_ph(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__V_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__V_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__V_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = call nnan half @llvm.vector.reduce.fmin.v32f16(<32 x half> [[TMP1]])
+// CHECK-NEXT:    ret half [[TMP2]]
+//
 _Float16 test_mm512_reduce_min_ph(__m512h __W) {
-  // CHECK-LABEL: @test_mm512_reduce_min_ph
-  // CHECK: call nnan half @llvm.vector.reduce.fmin.v32f16(<32 x half> %{{.*}})
   return _mm512_reduce_min_ph(__W);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_blend_ph(
+// CHECK-SAME: i32 noundef [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__W:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i32 [[__U]], ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[__U_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_ADDR_I]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP3]] to <32 x i1>
+// CHECK-NEXT:    [[TMP7:%.*]] = select <32 x i1> [[TMP6]], <32 x half> [[TMP4]], <32 x half> [[TMP5]]
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
-  // CHECK-LABEL: @test_mm512_mask_blend_ph
-  // CHECK:  %{{.*}} = bitcast i32 %{{.*}} to <32 x i1>
-  // CHECK:  %{{.*}} = select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}}
   return _mm512_mask_blend_ph(__U, __A, __W);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_permutex2var_ph(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <8 x i64> noundef [[__I:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__I_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__I_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[__I]], ptr [[__I_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i64>, ptr [[__I_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[__I_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <32 x i16>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i64>, ptr [[__I_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i64> [[TMP5]] to <32 x i16>
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <32 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> [[TMP4]], <32 x i16> [[TMP6]], <32 x i16> [[TMP8]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <32 x i16> [[TMP9]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP10]]
+//
 __m512h test_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_permutex2var_ph
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16>
-  // CHECK:  %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16>
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16>
-  // CHECK:  %{{.*}} = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
-  // CHECK:  %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half>
   return _mm512_permutex2var_ph(__A, __I, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_permutexvar_epi16(
+// CHECK-SAME: <8 x i64> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x i64>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <8 x i64> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i64>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <8 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <32 x i16>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i64>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i64> [[TMP4]] to <32 x i16>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> [[TMP3]], <32 x i16> [[TMP5]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x i16> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_permutexvar_epi16(__m512i __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_permutexvar_epi16
-  // CHECK:  %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16>
-  // CHECK:  %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16>
-  // CHECK:  %{{.*}} = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}})
-  // CHECK:  %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half>
   return _mm512_permutexvar_ph(__A, __B);
 }
 
 // tests below are for alias intrinsics.
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mul_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP3]], <16 x float> [[TMP5]], <16 x float> zeroinitializer, i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_mul_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mul_pch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_mul_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_mul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mask_mul_pch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_mul_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x float> [[TMP10]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_mul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_maskz_mul_pch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP3]], <16 x float> [[TMP5]], <16 x float> zeroinitializer, i16 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x float> [[TMP6]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP7]]
+//
 __m512h test_mm512_cmul_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_cmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_cmul_pch(__A, __B);
 }
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cmul_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <32 x half> [[TMP6]] to <16 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <32 x half>, ptr [[__W_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <32 x half> [[TMP8]] to <16 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP5]], <16 x float> [[TMP7]], <16 x float> [[TMP9]], i16 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x float> [[TMP11]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP12]]
+//
 __m512h test_mm512_mask_cmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_cmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_mask_cmul_pch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cmul_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP1:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    store <32 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    store <32 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = load <32 x half>, ptr [[__A_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> [[TMP3]] to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <32 x half>, ptr [[__B_ADDR_I]], align 64
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <32 x half> [[TMP5]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP7:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 64
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <32 x half> [[TMP7]] to <16 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__U_ADDR_I]], align 2
+// CHECK-NEXT:    [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x float> [[TMP8]], i16 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x float> [[TMP10]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP11]]
+//
 __m512h test_mm512_maskz_cmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_cmul_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_maskz_cmul_pch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mul_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x float> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_mul_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mul_sch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_mul_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_mask_mul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mask_mul_sch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_mul_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_mul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_maskz_mul_sch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> zeroinitializer to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm_mul_round_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_mul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_mul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_mul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_mask_mul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_mul_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_mul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_mul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.csh
   return _mm_maskz_mul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> zeroinitializer to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP4]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP5]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_mul_round_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_mul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_mul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_mul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_mask_mul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_mul_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_mul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_mul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfmul.cph.512
   return _mm512_maskz_mul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_cmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <32 x half> zeroinitializer to <16 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP4]], i16 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x float> [[TMP5]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP6]]
+//
 __m512h test_mm512_cmul_round_pch(__m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_cmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_cmul_round_pch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_mask_cmul_round_pch(
+// CHECK-SAME: <32 x half> noundef [[__W:%.*]], i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store <32 x half> [[__W]], ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[__W_ADDR]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_mask_cmul_round_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_mask_cmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_mask_cmul_round_pch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <32 x half> @test_mm512_maskz_cmul_round_pch(
+// CHECK-SAME: i16 noundef zeroext [[__U:%.*]], <32 x half> noundef [[__A:%.*]], <32 x half> noundef [[__B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <32 x half>, align 64
+// CHECK-NEXT:    store i16 [[__U]], ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    store <32 x half> [[__A]], ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    store <32 x half> [[__B]], ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP0:%.*]] = load <32 x half>, ptr [[__A_ADDR]], align 64
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <32 x half> [[TMP0]] to <16 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <32 x half>, ptr [[__B_ADDR]], align 64
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <32 x half> [[TMP2]] to <16 x float>
+// CHECK-NEXT:    store <32 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP4:%.*]] = load <32 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 64
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <32 x half> [[TMP4]] to <16 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__U_ADDR]], align 2
+// CHECK-NEXT:    [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> [[TMP1]], <16 x float> [[TMP3]], <16 x float> [[TMP5]], i16 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x float> [[TMP7]] to <32 x half>
+// CHECK-NEXT:    ret <32 x half> [[TMP8]]
+//
 __m512h test_mm512_maskz_cmul_round_pch(__mmask16 __U, __m512h __A, __m512h __B) {
-  // CHECK-LABEL: @test_mm512_maskz_cmul_round_pch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.cph.512
   return _mm512_maskz_cmul_round_pch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP3]], <4 x float> [[TMP5]], <4 x float> zeroinitializer, i8 -1, i32 4)
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP7]]
+//
 __m128h test_mm_cmul_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_cmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_cmul_sch(__A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cmul_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP0]], ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    store i8 [[TMP1]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, ptr [[__W_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP5]], <4 x float> [[TMP7]], <4 x float> [[TMP9]], i8 [[TMP10]], i32 4)
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP12]]
+//
 __m128h test_mm_mask_cmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_cmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_mask_cmul_sch(__W, __U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cmul_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR_I:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[TMP0]], ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    store <8 x half> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    store <8 x half> [[TMP2]], ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, ptr [[__A_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, ptr [[__B_ADDR_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = load i8, ptr [[__U_ADDR_I]], align 1
+// CHECK-NEXT:    [[TMP10:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x float> [[TMP8]], i8 [[TMP9]], i32 4)
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP11]]
+//
 __m128h test_mm_maskz_cmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_cmul_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_maskz_cmul_sch(__U, __A, __B);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_cmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> zeroinitializer to <4 x float>
+// CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP4]], i8 -1, i32 11)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 __m128h test_mm_cmul_round_sch(__m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_cmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_cmul_round_sch(__A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_mask_cmul_round_sch(
+// CHECK-SAME: <8 x half> noundef [[__W:%.*]], i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__W_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store <8 x half> [[__W]], ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[__W_ADDR]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_mask_cmul_round_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_mask_cmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_mask_cmul_round_sch(__W, __U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
 
+// CHECK-LABEL: define dso_local <8 x half> @test_mm_maskz_cmul_round_sch(
+// CHECK-SAME: i8 noundef zeroext [[__U:%.*]], <8 x half> noundef [[__A:%.*]], <8 x half> noundef [[__B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__U_ADDR:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__A_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__B_ADDR:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    store i8 [[__U]], ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    store <8 x half> [[__A]], ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    store <8 x half> [[__B]], ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[__A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[__B_ADDR]], align 16
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[TMP2]] to <4 x float>
+// CHECK-NEXT:    store <8 x half> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP6:%.*]] = load i8, ptr [[__U_ADDR]], align 1
+// CHECK-NEXT:    [[TMP7:%.*]] = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]], i8 [[TMP6]], i32 11)
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP8]]
+//
 __m128h test_mm_maskz_cmul_round_sch(__mmask8 __U, __m128h __A, __m128h __B) {
-  // CHECK-LABEL: @test_mm_maskz_cmul_round_sch
-  // CHECK: @llvm.x86.avx512fp16.mask.vfcmul.csh
   return _mm_maskz_cmul_round_sch(__U, __A, __B, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
 }
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index aae5cfb8bb1d99..675026eb5bd572 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
 // RUN: %clang_cc1 -flax-vector-conversions=none -fms-extensions -fms-compatibility -ffreestanding %s -triple=x86_64-windows-msvc -target-feature +sse -emit-llvm -o - -Wall -Werror | FileCheck %s
 
@@ -6,864 +7,615 @@
 
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
 
+//
 __m128 test_mm_add_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_add_ps
-  // CHECK: fadd <4 x float>
   return _mm_add_ps(A, B);
 }
 
+//
 __m128 test_mm_add_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_add_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fadd float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_add_ss(A, B);
 }
 
+//
 __m128 test_mm_and_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_and_ps
-  // CHECK: and <4 x i32>
   return _mm_and_ps(A, B);
 }
 
+//
 __m128 test_mm_andnot_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_andnot_ps
-  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
-  // CHECK: and <4 x i32>
   return _mm_andnot_ps(A, B);
 }
 
+//
 __m128 test_mm_cmp_ps_eq_oq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_eq_oq
-  // CHECK: fcmp oeq <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_EQ_OQ);
 }
 
+//
 __m128 test_mm_cmp_ps_lt_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_lt_os
-  // CHECK: fcmp olt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_LT_OS);
 }
 
+//
 __m128 test_mm_cmp_ps_le_os(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_le_os
-  // CHECK: fcmp ole <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_LE_OS);
 }
 
+//
 __m128 test_mm_cmp_ps_unord_q(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_unord_q
-  // CHECK: fcmp uno <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_UNORD_Q);
 }
 
+//
 __m128 test_mm_cmp_ps_neq_uq(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_neq_uq
-  // CHECK: fcmp une <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NEQ_UQ);
 }
 
+//
 __m128 test_mm_cmp_ps_nlt_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nlt_us
-  // CHECK: fcmp uge <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NLT_US);
 }
 
+//
 __m128 test_mm_cmp_ps_nle_us(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_nle_us
-  // CHECK: fcmp ugt <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_NLE_US);
 }
 
+//
 __m128 test_mm_cmp_ps_ord_q(__m128 a, __m128 b) {
-  // CHECK-LABEL: test_mm_cmp_ps_ord_q
-  // CHECK: fcmp ord <4 x float> %{{.*}}, %{{.*}}
   return _mm_cmp_ps(a, b, _CMP_ORD_Q);
 }
 
+//
 __m128 test_mm_cmp_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_cmp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
   return _mm_cmp_ss(A, B, _CMP_ORD_Q);
 }
 
+//
 __m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpeq_ps
-  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpeq_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpeq_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpeq_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
   return _mm_cmpeq_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpge_ps
-  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpge_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpge_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpgt_ps
-  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpgt_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpgt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpgt_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmple_ps
-  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmple_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmple_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmple_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
   return _mm_cmple_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmplt_ps
-  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmplt_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmplt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
   return _mm_cmplt_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpneq_ps
-  // CHECK:         [[CMP:%.*]] = fcmp une <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpneq_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpneq_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpneq_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
   return _mm_cmpneq_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnge_ps
-  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpnge_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpnge_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpngt_ps
-  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpngt_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpngt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   return _mm_cmpngt_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnle_ps
-  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpnle_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnle_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnle_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
   return _mm_cmpnle_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnlt_ps
-  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpnlt_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpnlt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
   return _mm_cmpnlt_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpord_ps
-  // CHECK:         [[CMP:%.*]] = fcmp ord <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpord_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpord_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpord_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
   return _mm_cmpord_ss(__a, __b);
 }
 
+//
 __m128 test_mm_cmpunord_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpunord_ps
-  // CHECK:         [[CMP:%.*]] = fcmp uno <4 x float>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
-  // CHECK-NEXT:    ret <4 x float> [[BC]]
   return _mm_cmpunord_ps(__a, __b);
 }
 
+//
 __m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: test_mm_cmpunord_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
   return _mm_cmpunord_ss(__a, __b);
 }
 
+//
 int test_mm_comieq_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comieq_ss
-  // CHECK: call i32 @llvm.x86.sse.comieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comieq_ss(A, B);
 }
 
+//
 int test_mm_comige_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comige_ss
-  // CHECK: call i32 @llvm.x86.sse.comige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comige_ss(A, B);
 }
 
+//
 int test_mm_comigt_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comigt_ss
-  // CHECK: call i32 @llvm.x86.sse.comigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comigt_ss(A, B);
 }
 
+//
 int test_mm_comile_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comile_ss
-  // CHECK: call i32 @llvm.x86.sse.comile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comile_ss(A, B);
 }
 
+//
 int test_mm_comilt_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comilt_ss
-  // CHECK: call i32 @llvm.x86.sse.comilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comilt_ss(A, B);
 }
 
+//
 int test_mm_comineq_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_comineq_ss
-  // CHECK: call i32 @llvm.x86.sse.comineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_comineq_ss(A, B);
 }
 
+//
 int test_mm_cvt_ss2si(__m128 A) {
-  // CHECK-LABEL: test_mm_cvt_ss2si
-  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
   return _mm_cvt_ss2si(A);
 }
 
+//
 __m128 test_mm_cvtsi32_ss(__m128 A, int B) {
-  // CHECK-LABEL: test_mm_cvtsi32_ss
-  // CHECK: sitofp i32 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_cvtsi32_ss(A, B);
 }
 
 #ifdef __x86_64__
+//
 __m128 test_mm_cvtsi64_ss(__m128 A, long long B) {
-  // CHECK-LABEL: test_mm_cvtsi64_ss
-  // CHECK: sitofp i64 %{{.*}} to float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_cvtsi64_ss(A, B);
 }
 #endif
 
+//
 float test_mm_cvtss_f32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtss_f32
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   return _mm_cvtss_f32(A);
 }
 
+//
 int test_mm_cvtss_si32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtss_si32
-  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
   return _mm_cvtss_si32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvtss_si64(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtss_si64
-  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
   return _mm_cvtss_si64(A);
 }
 #endif
 
+//
 int test_mm_cvtt_ss2si(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtt_ss2si
-  // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
   return _mm_cvtt_ss2si(A);
 }
 
+//
 int test_mm_cvttss_si32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvttss_si32
-  // CHECK: call i32 @llvm.x86.sse.cvttss2si(<4 x float> %{{.*}})
   return _mm_cvttss_si32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvttss_si64(__m128 A) {
-  // CHECK-LABEL: test_mm_cvttss_si64
-  // CHECK: call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %{{.*}})
   return _mm_cvttss_si64(A);
 }
 #endif
 
+//
 __m128 test_mm_div_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_div_ps
-  // CHECK: fdiv <4 x float>
   return _mm_div_ps(A, B);
 }
 
+//
 __m128 test_mm_div_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_div_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fdiv float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_div_ss(A, B);
 }
 
+//
 unsigned int test_MM_GET_EXCEPTION_MASK(void) {
-  // CHECK-LABEL: test_MM_GET_EXCEPTION_MASK
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
-  // CHECK: and i32 %{{.*}}, 8064
   return _MM_GET_EXCEPTION_MASK();
 }
 
+//
 unsigned int test_MM_GET_EXCEPTION_STATE(void) {
-  // CHECK-LABEL: test_MM_GET_EXCEPTION_STATE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
-  // CHECK: and i32 %{{.*}}, 63
   return _MM_GET_EXCEPTION_STATE();
 }
 
+//
 unsigned int test_MM_GET_FLUSH_ZERO_MODE(void) {
-  // CHECK-LABEL: test_MM_GET_FLUSH_ZERO_MODE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
-  // CHECK: and i32 %{{.*}}, 32768
   return _MM_GET_FLUSH_ZERO_MODE();
 }
 
+//
 unsigned int test_MM_GET_ROUNDING_MODE(void) {
-  // CHECK-LABEL: test_MM_GET_ROUNDING_MODE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
-  // CHECK: and i32 %{{.*}}, 24576
   return _MM_GET_ROUNDING_MODE();
 }
 
+//
 unsigned int test_mm_getcsr(void) {
-  // CHECK-LABEL: test_mm_getcsr
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr %{{.*}})
-  // CHECK: load i32
   return _mm_getcsr();
 }
 
+//
 __m128 test_mm_load_ps(float* y) {
-  // CHECK-LABEL: test_mm_load_ps
-  // CHECK: load <4 x float>, ptr {{.*}}, align 16
   return _mm_load_ps(y);
 }
 
+//
 __m128 test_mm_load_ps1(float* y) {
-  // CHECK-LABEL: test_mm_load_ps1
-  // CHECK: load float, ptr %{{.*}}, align 4
-  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
   return _mm_load_ps1(y);
 }
 
+//
 __m128 test_mm_load_ss(float* y) {
-  // CHECK-LABEL: test_mm_load_ss
-  // CHECK: load float, ptr {{.*}}, align 1{{$}}
-  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
-  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
-  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
   return _mm_load_ss(y);
 }
 
+//
 __m128 test_mm_load1_ps(float* y) {
-  // CHECK-LABEL: test_mm_load1_ps
-  // CHECK: load float, ptr %{{.*}}, align 4
-  // CHECK: insertelement <4 x float> poison, float %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
   return _mm_load1_ps(y);
 }
 
+//
 __m128 test_mm_loadh_pi(__m128 x, __m64* y) {
-  // CHECK-LABEL: test_mm_loadh_pi
-  // CHECK: load <2 x float>, ptr {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm_loadh_pi(x,y);
 }
 
+//
 __m128 test_mm_loadl_pi(__m128 x, __m64* y) {
-  // CHECK-LABEL: test_mm_loadl_pi
-  // CHECK: load <2 x float>, ptr {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   return _mm_loadl_pi(x,y);
 }
 
+//
 __m128 test_mm_loadr_ps(float* A) {
-  // CHECK-LABEL: test_mm_loadr_ps
-  // CHECK: load <4 x float>, ptr %{{.*}}, align 16
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   return _mm_loadr_ps(A);
 }
 
+//
 __m128 test_mm_loadu_ps(float* A) {
-  // CHECK-LABEL: test_mm_loadu_ps
-  // CHECK: load <4 x float>, ptr %{{.*}}, align 1{{$}}
   return _mm_loadu_ps(A);
 }
 
+//
 __m128 test_mm_max_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_max_ps
-  // CHECK: @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_max_ps(A, B);
 }
 
+//
 __m128 test_mm_max_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_max_ss
-  // CHECK: @llvm.x86.sse.max.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_max_ss(A, B);
 }
 
+//
 __m128 test_mm_min_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_min_ps
-  // CHECK: @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_min_ps(A, B);
 }
 
+//
 __m128 test_mm_min_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_min_ss
-  // CHECK: @llvm.x86.sse.min.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_min_ss(A, B);
 }
 
+//
 __m128 test_mm_move_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_move_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_move_ss(A, B);
 }
 
+//
 __m128 test_mm_movehl_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_movehl_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   return _mm_movehl_ps(A, B);
 }
 
+//
 __m128 test_mm_movelh_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_movelh_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   return _mm_movelh_ps(A, B);
 }
 
+//
 int test_mm_movemask_ps(__m128 A) {
-  // CHECK-LABEL: test_mm_movemask_ps
-  // CHECK: call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}})
   return _mm_movemask_ps(A);
 }
 
+//
 __m128 test_mm_mul_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_mul_ps
-  // CHECK: fmul <4 x float>
   return _mm_mul_ps(A, B);
 }
 
+//
 __m128 test_mm_mul_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_mul_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fmul float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_mul_ss(A, B);
 }
 
+//
 __m128 test_mm_or_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_or_ps
-  // CHECK: or <4 x i32>
   return _mm_or_ps(A, B);
 }
 
+//
 void test_mm_prefetch(char const* p) {
-  // CHECK-LABEL: test_mm_prefetch
-  // CHECK: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 0, i32 1)
   _mm_prefetch(p, 0);
 }
 
+//
 __m128 test_mm_rcp_ps(__m128 x) {
-  // CHECK-LABEL: test_mm_rcp_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> {{.*}})
   return _mm_rcp_ps(x);
 }
 
+//
 __m128 test_mm_rcp_ss(__m128 x) {
-  // CHECK-LABEL: test_mm_rcp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> {{.*}})
   return _mm_rcp_ss(x);
 }
 
+//
 __m128 test_mm_rsqrt_ps(__m128 x) {
-  // CHECK-LABEL: test_mm_rsqrt_ps
-  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> {{.*}})
   return _mm_rsqrt_ps(x);
 }
 
+//
 __m128 test_mm_rsqrt_ss(__m128 x) {
-  // CHECK-LABEL: test_mm_rsqrt_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> {{.*}})
   return _mm_rsqrt_ss(x);
 }
 
+//
 void test_MM_SET_EXCEPTION_MASK(unsigned int A) {
-  // CHECK-LABEL: test_MM_SET_EXCEPTION_MASK
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
-  // CHECK: load i32
-  // CHECK: and i32 {{.*}}, -8065
-  // CHECK: or i32
-  // CHECK: store i32
-  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
   _MM_SET_EXCEPTION_MASK(A);
 }
 
+//
 void test_MM_SET_EXCEPTION_STATE(unsigned int A) {
-  // CHECK-LABEL: test_MM_SET_EXCEPTION_STATE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
-  // CHECK: load i32
-  // CHECK: and i32 {{.*}}, -64
-  // CHECK: or i32
-  // CHECK: store i32
-  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
   _MM_SET_EXCEPTION_STATE(A);
 }
 
+//
 void test_MM_SET_FLUSH_ZERO_MODE(unsigned int A) {
-  // CHECK-LABEL: test_MM_SET_FLUSH_ZERO_MODE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
-  // CHECK: load i32
-  // CHECK: and i32 {{.*}}, -32769
-  // CHECK: or i32
-  // CHECK: store i32
-  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
   _MM_SET_FLUSH_ZERO_MODE(A);
 }
 
+//
 __m128 test_mm_set_ps(float A, float B, float C, float D) {
-  // CHECK-LABEL: test_mm_set_ps
-  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
   return _mm_set_ps(A, B, C, D);
 }
 
+//
 __m128 test_mm_set_ps1(float A) {
-  // CHECK-LABEL: test_mm_set_ps1
-  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
   return _mm_set_ps1(A);
 }
 
+//
 void test_MM_SET_ROUNDING_MODE(unsigned int A) {
-  // CHECK-LABEL: test_MM_SET_ROUNDING_MODE
-  // CHECK: call void @llvm.x86.sse.stmxcsr(ptr {{.*}})
-  // CHECK: load i32
-  // CHECK: and i32 {{.*}}, -24577
-  // CHECK: or i32
-  // CHECK: store i32
-  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
   _MM_SET_ROUNDING_MODE(A);
 }
 
+//
 __m128 test_mm_set_ss(float A) {
-  // CHECK-LABEL: test_mm_set_ss
-  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
-  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 1
-  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 2
-  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 3
   return _mm_set_ss(A);
 }
 
+//
 __m128 test_mm_set1_ps(float A) {
-  // CHECK-LABEL: test_mm_set1_ps
-  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
   return _mm_set1_ps(A);
 }
 
+//
 void test_mm_setcsr(unsigned int A) {
-  // CHECK-LABEL: test_mm_setcsr
-  // CHECK: store i32
-  // CHECK: call void @llvm.x86.sse.ldmxcsr(ptr {{.*}})
   _mm_setcsr(A);
 }
 
+//
 __m128 test_mm_setr_ps(float A, float B, float C, float D) {
-  // CHECK-LABEL: test_mm_setr_ps
-  // CHECK: insertelement <4 x float> poison, float {{.*}}, i32 0
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
   return _mm_setr_ps(A, B, C, D);
 }
 
+//
 __m128 test_mm_setzero_ps(void) {
-  // CHECK-LABEL: test_mm_setzero_ps
-  // CHECK: store <4 x float> zeroinitializer
   return _mm_setzero_ps();
 }
 
+//
 void test_mm_sfence(void) {
-  // CHECK-LABEL: test_mm_sfence
-  // CHECK: call void @llvm.x86.sse.sfence()
   _mm_sfence();
 }
 
+//
 __m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_shuffle_ps
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
   return _mm_shuffle_ps(A, B, 0);
 }
 
+//
 __m128 test_mm_sqrt_ps(__m128 x) {
-  // CHECK-LABEL: test_mm_sqrt_ps
-  // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}})
   return _mm_sqrt_ps(x);
 }
 
+//
 __m128 test_mm_sqrt_ss(__m128 x) {
-  // CHECK-LABEL: test_mm_sqrt_ss
-  // CHECK: extractelement <4 x float> {{.*}}, i64 0
-  // CHECK: call float @llvm.sqrt.f32(float {{.*}})
-  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i64 0
   return _mm_sqrt_ss(x);
 }
 
+//
 void test_mm_store_ps(float* x, __m128 y) {
-  // CHECK-LABEL: test_mm_store_ps
-  // CHECK: store <4 x float> %{{.*}}, ptr {{.*}}, align 16
   _mm_store_ps(x, y);
 }
 
+//
 void test_mm_store_ps1(float* x, __m128 y) {
-  // CHECK-LABEL: test_mm_store_ps1
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store_ps1(x, y);
 }
 
+//
 void test_mm_store_ss(float* x, __m128 y) {
-  // CHECK-LABEL: test_mm_store_ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: store float %{{.*}}, ptr {{.*}}, align 1{{$}}
   _mm_store_ss(x, y);
 }
 
+//
 void test_mm_store1_ps(float* x, __m128 y) {
-  // CHECK-LABEL: test_mm_store1_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store1_ps(x, y);
 }
 
+//
 void test_mm_storeh_pi(__m64* x,  __m128 y) {
-  // CHECK-LABEL: test_mm_storeh_pi
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 2, i32 3>
-  // CHECK: store <2 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_storeh_pi(x, y);
 }
 
+//
 void test_mm_storel_pi(__m64* x,  __m128 y) {
-  // CHECK-LABEL: test_mm_storel_pi
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
-  // CHECK: store <2 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_storel_pi(x, y);
 }
 
+//
 void test_mm_storer_ps(float* x,  __m128 y) {
-  // CHECK-LABEL: test_mm_storer_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-  // CHECK: store <4 x float> %{{.*}}, ptr {{.*}}, align 16
   _mm_storer_ps(x, y);
 }
 
+//
 void test_mm_storeu_ps(float* x,  __m128 y) {
-  // CHECK-LABEL: test_mm_storeu_ps
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_ps(x, y);
 }
 
+//
 void test_mm_stream_ps(float*A, __m128 B) {
-  // CHECK-LABEL: test_mm_stream_ps
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_ps(A, B);
 }
 
+//
 void test_mm_stream_ps_void(void *A, __m128 B) {
-  // CHECK-LABEL: test_mm_stream_ps_void
-  // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_ps(A, B);
 }
 
+//
 __m128 test_mm_sub_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_sub_ps
-  // CHECK: fsub <4 x float>
   return _mm_sub_ps(A, B);
 }
 
+//
 __m128 test_mm_sub_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_sub_ss
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fsub float
-  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
   return _mm_sub_ss(A, B);
 }
 
+//
 void test_MM_TRANSPOSE4_PS(__m128 *A, __m128 *B, __m128 *C, __m128 *D) {
-  // CHECK-LABEL: test_MM_TRANSPOSE4_PS
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   _MM_TRANSPOSE4_PS(*A, *B, *C, *D);
 }
 
+//
 int test_mm_ucomieq_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomieq_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomieq_ss(A, B);
 }
 
+//
 int test_mm_ucomige_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomige_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomige_ss(A, B);
 }
 
+//
 int test_mm_ucomigt_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomigt_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomigt_ss(A, B);
 }
 
+//
 int test_mm_ucomile_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomile_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomile_ss(A, B);
 }
 
+//
 int test_mm_ucomilt_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomilt_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomilt_ss(A, B);
 }
 
+//
 int test_mm_ucomineq_ss(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_ucomineq_ss
-  // CHECK: call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_ucomineq_ss(A, B);
 }
 
+//
 __m128 test_mm_undefined_ps(void) {
-  // CHECK-LABEL: test_mm_undefined_ps
-  // CHECK: ret <4 x float> zeroinitializer
   return _mm_undefined_ps();
 }
 
+//
 __m128 test_mm_unpackhi_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_unpackhi_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_ps(A, B);
 }
 
+//
 __m128 test_mm_unpacklo_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_unpacklo_ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_ps(A, B);
 }
 
+//
 __m128 test_mm_xor_ps(__m128 A, __m128 B) {
-  // CHECK-LABEL: test_mm_xor_ps
-  // CHECK: xor <4 x i32>
   return _mm_xor_ps(A, B);
 }
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 3ca7777e763359..6645b84c9c5dd9 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +sse2 -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X64
 // RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=CHECK,X86
@@ -9,1767 +10,6879 @@
 
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ADD_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_add_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_add_epi8
-  // CHECK: add <16 x i8>
   return _mm_add_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ADD_I:%.*]] = add <8 x i16> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ADD_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_add_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_add_epi16
-  // CHECK: add <8 x i16>
   return _mm_add_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[ADD_I:%.*]] = add <4 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[ADD_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_add_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_add_epi32
-  // CHECK: add <4 x i32>
   return _mm_add_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[ADD_I]]
+//
 __m128i test_mm_add_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_add_epi64
-  // CHECK: add <2 x i64>
   return _mm_add_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[ADD_I:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <2 x double> [[ADD_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_add_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_add_pd
-  // CHECK: fadd <2 x double>
   return _mm_add_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_add_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// X86-NEXT:    [[ADD_I:%.*]] = fadd double [[VECEXT1_I]], [[VECEXT_I]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP4]], double [[ADD_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_add_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_add_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fadd double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_add_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_adds_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_adds_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_adds_epi8
-  // CHECK: call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_adds_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_adds_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_adds_epi16
-  // CHECK: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_adds_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_adds_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_adds_epu8
-  // CHECK-NOT: call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epu8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_adds_epu16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_adds_epu16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_adds_epu16
-  // CHECK-NOT: call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
-  // CHECK: call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epu16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_and_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[AND_I:%.*]] = and <2 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[AND_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_and_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_and_pd
-  // CHECK: and <2 x i64>
   return _mm_and_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_and_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[AND_I:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[AND_I]]
+//
 __m128i test_mm_and_si128(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_and_si128
-  // CHECK: and <2 x i64>
   return _mm_and_si128(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_andnot_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[NOT_I:%.*]] = xor <2 x i64> [[TMP3]], <i64 -1, i64 -1>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[AND_I:%.*]] = and <2 x i64> [[NOT_I]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[AND_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_andnot_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_andnot_pd
-  // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
-  // CHECK: and <2 x i64>
   return _mm_andnot_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_andnot_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[NOT_I:%.*]] = xor <2 x i64> [[TMP2]], <i64 -1, i64 -1>
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[AND_I:%.*]] = and <2 x i64> [[NOT_I]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[AND_I]]
+//
 __m128i test_mm_andnot_si128(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_andnot_si128
-  // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
-  // CHECK: and <2 x i64>
   return _mm_andnot_si128(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_avg_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_avg_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_avg_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_avg_epu8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_avg_epu16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_avg_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_avg_epu16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_bslli_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// X86-NEXT:    [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+// X86-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_bslli_si128(__m128i A) {
-  // CHECK-LABEL: test_mm_bslli_si128
-  // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
   return _mm_bslli_si128(A, 5);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_bsrli_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// X86-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+// X86-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_bsrli_si128(__m128i A) {
-  // CHECK-LABEL: test_mm_bsrli_si128
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
   return _mm_bsrli_si128(A, 5);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castpd_ps(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_castpd_ps(__m128d A) {
-  // CHECK-LABEL: test_mm_castpd_ps
   return _mm_castpd_ps(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castpd_si128(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_castpd_si128(__m128d A) {
-  // CHECK-LABEL: test_mm_castpd_si128
   return _mm_castpd_si128(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castps_pd(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_castps_pd(__m128 A) {
-  // CHECK-LABEL: test_mm_castps_pd
   return _mm_castps_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castps_si128(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_castps_si128(__m128 A) {
-  // CHECK-LABEL: test_mm_castps_si128
   return _mm_castps_si128(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castsi128_pd(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_castsi128_pd(__m128i A) {
-  // CHECK-LABEL: test_mm_castsi128_pd
   return _mm_castsi128_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_castsi128_ps(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_castsi128_ps(__m128i A) {
-  // CHECK-LABEL: test_mm_castsi128_ps
   return _mm_castsi128_ps(A);
 }
 
+//
+// X86-LABEL: define void @test_mm_clflush(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    call void @llvm.x86.sse2.clflush(ptr [[TMP0]])
+// X86-NEXT:    ret void
+//
 void test_mm_clflush(void* A) {
-  // CHECK-LABEL: test_mm_clflush
-  // CHECK: call void @llvm.x86.sse2.clflush(ptr %{{.*}})
   _mm_clflush(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_eq_oq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_eq_oq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_eq_oq
-  // CHECK: fcmp oeq <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_EQ_OQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_lt_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_lt_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_lt_os
-  // CHECK: fcmp olt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_LT_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_le_os(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_le_os(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_le_os
-  // CHECK: fcmp ole <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_LE_OS);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_unord_q(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uno <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_unord_q(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_unord_q
-  // CHECK: fcmp uno <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_UNORD_Q);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_neq_uq(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp une <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_neq_uq(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_neq_uq
-  // CHECK: fcmp une <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NEQ_UQ);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nlt_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp uge <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nlt_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nlt_us
-  // CHECK: fcmp uge <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NLT_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_nle_us(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ugt <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_nle_us(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_nle_us
-  // CHECK: fcmp ugt <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_NLE_US);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_pd_ord_q(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = fcmp ord <2 x double> [[TMP0]], [[TMP1]]
+// X86-NEXT:    [[TMP3:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cmp_pd_ord_q(__m128d a, __m128d b) {
-  // CHECK-LABEL: test_mm_cmp_pd_ord_q
-  // CHECK: fcmp ord <2 x double> %{{.*}}, %{{.*}}
   return _mm_cmp_pd(a, b, _CMP_ORD_Q);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmp_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP0]], <2 x double> [[TMP1]], i8 7)
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128d test_mm_cmp_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmp_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
   return _mm_cmp_sd(A, B, _CMP_ORD_Q);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpeq_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp eq <16 x i8> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpeq_epi8
-  // CHECK: icmp eq <16 x i8>
   return _mm_cmpeq_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpeq_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp eq <8 x i16> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpeq_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpeq_epi16
-  // CHECK: icmp eq <8 x i16>
   return _mm_cmpeq_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpeq_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp eq <4 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpeq_epi32
-  // CHECK: icmp eq <4 x i32>
   return _mm_cmpeq_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpeq_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp oeq <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpeq_pd
-  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpeq_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpeq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 0)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpeq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpeq_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
   return _mm_cmpeq_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpge_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp ole <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpge_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpge_pd
-  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpge_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpge_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 2)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128d test_mm_cmpge_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpge_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpge_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpgt_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp sgt <16 x i8> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi8
-  // CHECK: icmp sgt <16 x i8>
   return _mm_cmpgt_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpgt_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp sgt <8 x i16> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi16
-  // CHECK: icmp sgt <8 x i16>
   return _mm_cmpgt_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpgt_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[CMP_I:%.*]] = icmp sgt <4 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[SEXT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi32
-  // CHECK: icmp sgt <4 x i32>
   return _mm_cmpgt_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpgt_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp olt <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpgt_pd
-  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpgt_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpgt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 1)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128d test_mm_cmpgt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpgt_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpgt_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmple_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp ole <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmple_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmple_pd
-  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmple_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmple_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 2)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmple_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmple_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
   return _mm_cmple_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmplt_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// X86-NEXT:    [[CMP_I_I:%.*]] = icmp sgt <16 x i8> [[TMP5]], [[TMP7]]
+// X86-NEXT:    [[SEXT_I_I:%.*]] = sext <16 x i1> [[CMP_I_I]] to <16 x i8>
+// X86-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[SEXT_I_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmplt_epi8
-  // CHECK: icmp sgt <16 x i8>
   return _mm_cmplt_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmplt_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <8 x i16>
+// X86-NEXT:    [[CMP_I_I:%.*]] = icmp sgt <8 x i16> [[TMP5]], [[TMP7]]
+// X86-NEXT:    [[SEXT_I_I:%.*]] = sext <8 x i1> [[CMP_I_I]] to <8 x i16>
+// X86-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[SEXT_I_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128i test_mm_cmplt_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmplt_epi16
-  // CHECK: icmp sgt <8 x i16>
   return _mm_cmplt_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmplt_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <4 x i32>
+// X86-NEXT:    [[CMP_I_I:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[TMP7]]
+// X86-NEXT:    [[SEXT_I_I:%.*]] = sext <4 x i1> [[CMP_I_I]] to <4 x i32>
+// X86-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[SEXT_I_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmplt_epi32
-  // CHECK: icmp sgt <4 x i32>
   return _mm_cmplt_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmplt_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp olt <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmplt_pd
-  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmplt_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmplt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 1)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmplt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmplt_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
   return _mm_cmplt_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpneq_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp une <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpneq_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpneq_pd
-  // CHECK:         [[CMP:%.*]] = fcmp une <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpneq_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpneq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 4)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpneq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpneq_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
   return _mm_cmpneq_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnge_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp ugt <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpnge_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnge_pd
-  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpnge_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnge_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 6)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128d test_mm_cmpnge_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnge_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpnge_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpngt_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp uge <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpngt_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpngt_pd
-  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpngt_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpngt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 5)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[__C_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP8]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP9]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128d test_mm_cmpngt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpngt_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpngt_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnle_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp ugt <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpnle_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnle_pd
-  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpnle_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnle_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 6)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpnle_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnle_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
   return _mm_cmpnle_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnlt_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp uge <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpnlt_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnlt_pd
-  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpnlt_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpnlt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 5)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpnlt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpnlt_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
   return _mm_cmpnlt_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpord_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp ord <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpord_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpord_pd
-  // CHECK:         [[CMP:%.*]] = fcmp ord <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpord_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpord_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 7)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpord_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpord_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
   return _mm_cmpord_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpunord_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = fcmp uno <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    [[TMP5:%.*]] = sext <2 x i1> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_cmpunord_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpunord_pd
-  // CHECK:         [[CMP:%.*]] = fcmp uno <2 x double>
-  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
-  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
   return _mm_cmpunord_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cmpunord_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]], i8 3)
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cmpunord_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_cmpunord_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
   return _mm_cmpunord_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comieq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comieq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comieq_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comige_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comige.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comige_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comige_sd
-  // CHECK: call i32 @llvm.x86.sse2.comige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comige_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comigt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comigt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comigt_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comile_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comile.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comile_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comile_sd
-  // CHECK: call i32 @llvm.x86.sse2.comile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comile_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comilt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comilt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comilt_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_comineq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_comineq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_comineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comineq_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtepi32_pd(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <4 x i32>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    [[CONV_I:%.*]] = sitofp <2 x i32> [[SHUFFLE_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[CONV_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cvtepi32_pd(__m128i A) {
-  // CHECK-LABEL: test_mm_cvtepi32_pd
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1>
-  // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double>
   return _mm_cvtepi32_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtepi32_ps(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[CONV_I:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float>
+// X86-NEXT:    store <4 x float> [[CONV_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cvtepi32_ps(__m128i A) {
-  // CHECK-LABEL: test_mm_cvtepi32_ps
-  // CHECK: sitofp <4 x i32> %{{.*}} to <4 x float>
   return _mm_cvtepi32_ps(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtpd_epi32(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvtpd_epi32(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %{{.*}})
   return _mm_cvtpd_epi32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtpd_ps(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> [[TMP1]])
+// X86-NEXT:    store <4 x float> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128 test_mm_cvtpd_ps(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtpd_ps
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %{{.*}})
   return _mm_cvtpd_ps(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtps_epi32(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvtps_epi32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %{{.*}})
   return _mm_cvtps_epi32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtps_pd(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <2 x i32> <i32 0, i32 1>
+// X86-NEXT:    [[CONV_I:%.*]] = fpext <2 x float> [[SHUFFLE_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[CONV_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_cvtps_pd(__m128 A) {
-  // CHECK-LABEL: test_mm_cvtps_pd
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
-  // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
   return _mm_cvtps_pd(A);
 }
 
+//
+// X86-LABEL: define double @test_mm_cvtsd_f64(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
+// X86-NEXT:    ret double [[VECEXT_I]]
+//
 double test_mm_cvtsd_f64(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtsd_f64
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   return _mm_cvtsd_f64(A);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_cvtsd_si32(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> [[TMP1]])
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_cvtsd_si32(__m128d A) {
-  // CHECK-LABEL: test_mm_cvtsd_si32
-  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
   return _mm_cvtsd_si32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvtsd_si64(__m128d A) {
-  // X64-LABEL: test_mm_cvtsd_si64
-  // X64: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
   return _mm_cvtsd_si64(A);
 }
 #endif
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtsd_ss(
+// X86-SAME: <4 x float> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    store <4 x float> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <4 x float> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128 test_mm_cvtsd_ss(__m128 A, __m128d B) {
-  // CHECK-LABEL: test_mm_cvtsd_ss
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %{{.*}}, <2 x double> %{{.*}})
   return _mm_cvtsd_ss(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_cvtsi128_si32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    store <4 x i32> [[TMP2]], ptr [[__B_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr [[__B_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+// X86-NEXT:    ret i32 [[VECEXT_I]]
+//
 int test_mm_cvtsi128_si32(__m128i A) {
-  // CHECK-LABEL: test_mm_cvtsi128_si32
-  // CHECK: extractelement <4 x i32> %{{.*}}, i32 0
   return _mm_cvtsi128_si32(A);
 }
 
+//
+// X86-LABEL: define i64 @test_mm_cvtsi128_si64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+// X86-NEXT:    ret i64 [[VECEXT_I]]
+//
 long long test_mm_cvtsi128_si64(__m128i A) {
-  // CHECK-LABEL: test_mm_cvtsi128_si64
-  // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
   return _mm_cvtsi128_si64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtsi32_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__B_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__B_ADDR_I]], align 4
+// X86-NEXT:    [[CONV_I:%.*]] = sitofp i32 [[TMP2]] to double
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP3]], double [[CONV_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cvtsi32_sd(__m128d A, int B) {
-  // CHECK-LABEL: test_mm_cvtsi32_sd
-  // CHECK: sitofp i32 %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_cvtsi32_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtsi32_si128(
+// X86-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 0, i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 0, i32 2
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 0, i32 3
+// X86-NEXT:    store <4 x i32> [[VECINIT3_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvtsi32_si128(int A) {
-  // CHECK-LABEL: test_mm_cvtsi32_si128
-  // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
   return _mm_cvtsi32_si128(A);
 }
 
 #ifdef __x86_64__
+//
 __m128d test_mm_cvtsi64_sd(__m128d A, long long B) {
-  // X64-LABEL: test_mm_cvtsi64_sd
-  // X64: sitofp i64 %{{.*}} to double
-  // X64: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_cvtsi64_sd(A, B);
 }
 #endif
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtsi64_si128(
+// X86-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__A_ADDR_I]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__A_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 0, i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_cvtsi64_si128(long long A) {
-  // CHECK-LABEL: test_mm_cvtsi64_si128
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
   return _mm_cvtsi64_si128(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvtss_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <4 x float> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+// X86-NEXT:    [[CONV_I:%.*]] = fpext float [[VECEXT_I]] to double
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP3]], double [[CONV_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_cvtss_sd(__m128d A, __m128 B) {
-  // CHECK-LABEL: test_mm_cvtss_sd
-  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
-  // CHECK: fpext float %{{.*}} to double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_cvtss_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvttpd_epi32(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvttpd_epi32(__m128d A) {
-  // CHECK-LABEL: test_mm_cvttpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %{{.*}})
   return _mm_cvttpd_epi32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_cvttps_epi32(
+// X86-SAME: <4 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <4 x float>, align 16
+// X86-NEXT:    store <4 x float> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <4 x float> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> [[TMP1]])
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_cvttps_epi32(__m128 A) {
-  // CHECK-LABEL: test_mm_cvttps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %{{.*}})
   return _mm_cvttps_epi32(A);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_cvttsd_si32(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> [[TMP1]])
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_cvttsd_si32(__m128d A) {
-  // CHECK-LABEL: test_mm_cvttsd_si32
-  // CHECK: call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %{{.*}})
   return _mm_cvttsd_si32(A);
 }
 
 #ifdef __x86_64__
+//
 long long test_mm_cvttsd_si64(__m128d A) {
-  // X64-LABEL: test_mm_cvttsd_si64
-  // X64: call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %{{.*}})
   return _mm_cvttsd_si64(A);
 }
 #endif
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_div_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[DIV_I:%.*]] = fdiv <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <2 x double> [[DIV_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_div_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_div_pd
-  // CHECK: fdiv <2 x double>
   return _mm_div_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_div_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// X86-NEXT:    [[DIV_I:%.*]] = fdiv double [[VECEXT1_I]], [[VECEXT_I]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP4]], double [[DIV_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_div_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_div_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fdiv double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_div_sd(A, B);
 }
 
 // Lowering to pextrw requires optimization.
+//
+// X86-LABEL: define i32 @test_mm_extract_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// X86-NEXT:    [[TMP2:%.*]] = extractelement <8 x i16> [[TMP1]], i64 1
+// X86-NEXT:    [[CONV:%.*]] = zext i16 [[TMP2]] to i32
+// X86-NEXT:    ret i32 [[CONV]]
+//
 int test_mm_extract_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_extract_epi16
-  // CHECK: extractelement <8 x i16> %{{.*}}, {{i32|i64}} 1
-  // CHECK: zext i16 %{{.*}} to i32
   return _mm_extract_epi16(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_insert_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[CONV:%.*]] = trunc i32 [[TMP2]] to i16
+// X86-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> [[TMP1]], i16 [[CONV]], i64 0
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_insert_epi16(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_insert_epi16
-  // CHECK: insertelement <8 x i16> %{{.*}}, {{i32|i64}} 0
   return _mm_insert_epi16(A, B, 0);
 }
 
+//
+// X86-LABEL: define void @test_mm_lfence(
+// X86-SAME: ) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    call void @llvm.x86.sse2.lfence()
+// X86-NEXT:    ret void
+//
 void test_mm_lfence(void) {
-  // CHECK-LABEL: test_mm_lfence
-  // CHECK: call void @llvm.x86.sse2.lfence()
   _mm_lfence();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_load_pd(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_load_pd(double const* A) {
-  // CHECK-LABEL: test_mm_load_pd
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 16
   return _mm_load_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_load_pd1(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1
+// X86-NEXT:    store double [[TMP2]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP4]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_load_pd1(double const* A) {
-  // CHECK-LABEL: test_mm_load_pd1
-  // CHECK: load double, ptr %{{.*}}, align 8
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_load_pd1(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_load_sd(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1
+// X86-NEXT:    store double [[TMP2]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double 0.000000e+00, i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_load_sd(double const* A) {
-  // CHECK-LABEL: test_mm_load_sd
-  // CHECK: load double, ptr %{{.*}}, align 1{{$}}
   return _mm_load_sd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_load_si128(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_load_si128(__m128i const* A) {
-  // CHECK-LABEL: test_mm_load_si128
-  // CHECK: load <2 x i64>, ptr %{{.*}}, align 16
   return _mm_load_si128(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_load1_pd(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP1]], align 1
+// X86-NEXT:    store double [[TMP2]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP4]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_load1_pd(double const* A) {
-  // CHECK-LABEL: test_mm_load1_pd
-  // CHECK: load double, ptr %{{.*}}, align 8
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_load1_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadh_pd(
+// X86-SAME: <2 x double> noundef [[X:%.*]], ptr noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[X]], ptr [[X_ADDR]], align 16
+// X86-NEXT:    store ptr [[Y]], ptr [[Y_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[Y_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 1
+// X86-NEXT:    store double [[TMP3]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP5:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP5]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_loadh_pd(__m128d x, void* y) {
-  // CHECK-LABEL: test_mm_loadh_pd
-  // CHECK: load double, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loadh_pd(x, y);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadl_epi64(
+// X86-SAME: ptr noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[Y]], ptr [[Y_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[Y_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 1
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 0, i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_loadl_epi64(__m128i* y) {
-  // CHECK: test_mm_loadl_epi64
-  // CHECK: load i64, ptr {{.*}}, align 1{{$}}
-  // CHECK: insertelement <2 x i64> poison, i64 {{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> {{.*}}, i64 0, i32 1
   return _mm_loadl_epi64(y);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadl_pd(
+// X86-SAME: <2 x double> noundef [[X:%.*]], ptr noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[X]], ptr [[X_ADDR]], align 16
+// X86-NEXT:    store ptr [[Y]], ptr [[Y_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[Y_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store ptr [[TMP1]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 1
+// X86-NEXT:    store double [[TMP3]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load double, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_loadl_pd(__m128d x, void* y) {
-  // CHECK-LABEL: test_mm_loadl_pd
-  // CHECK: load double, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loadl_pd(x, y);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadr_pd(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[__U_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__U_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__U_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> <i32 1, i32 0>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_loadr_pd(double const* A) {
-  // CHECK-LABEL: test_mm_loadr_pd
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 16
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 0>
   return _mm_loadr_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadu_pd(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[TMP1]], align 1
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_loadu_pd(double const* A) {
-  // CHECK-LABEL: test_mm_loadu_pd
-  // CHECK: load <2 x double>, ptr %{{.*}}, align 1{{$}}
   return _mm_loadu_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadu_si128(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 1
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_loadu_si128(__m128i const* A) {
-  // CHECK-LABEL: test_mm_loadu_si128
-  // CHECK: load <2 x i64>, ptr %{{.*}}, align 1{{$}}
   return _mm_loadu_si128(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadu_si64(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 1
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__U_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__U_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 0, i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_loadu_si64(void const* A) {
-  // CHECK-LABEL: test_mm_loadu_si64
-  // CHECK: load i64, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
   return _mm_loadu_si64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadu_si32(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 1
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__U_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__U_I]], align 4
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 0, i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 0, i32 2
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 0, i32 3
+// X86-NEXT:    store <4 x i32> [[VECINIT3_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <4 x i32>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_loadu_si32(void const* A) {
-  // CHECK-LABEL: test_mm_loadu_si32
-  // CHECK: load i32, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
   return _mm_loadu_si32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_loadu_si16(
+// X86-SAME: ptr noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__U_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 1
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__U_I]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__U_I]], align 2
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 0, i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 0, i32 2
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 0, i32 3
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 0, i32 4
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 0, i32 5
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 0, i32 6
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 0, i32 7
+// X86-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_loadu_si16(void const* A) {
-  // CHECK-LABEL: test_mm_loadu_si16
-  // CHECK: load i16, ptr %{{.*}}, align 1{{$}}
-  // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
   return _mm_loadu_si16(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_madd_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_madd_epi16
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_madd_epi16(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_maskmoveu_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]], ptr noundef [[C:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__D_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__N_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[C]], ptr [[C_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[C_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__D_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__N_ADDR_I]], align 16
+// X86-NEXT:    store ptr [[TMP2]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__D_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[__N_ADDR_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> [[TMP4]], <16 x i8> [[TMP6]], ptr [[TMP7]])
+// X86-NEXT:    ret void
+//
 void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
-  // CHECK-LABEL: test_mm_maskmoveu_si128
-  // CHECK: call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, ptr %{{.*}})
   _mm_maskmoveu_si128(A, B, C);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_max_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_MAX_I:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_MAX_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_max_epi16
-  // CHECK: call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_max_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_max_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_MAX_I:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_MAX_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_max_epu8
-  // CHECK: call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_max_epu8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_max_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_max_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_max_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_max_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_max_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_max_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_max_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_max_sd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_mfence(
+// X86-SAME: ) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    call void @llvm.x86.sse2.mfence()
+// X86-NEXT:    ret void
+//
 void test_mm_mfence(void) {
-  // CHECK-LABEL: test_mm_mfence
-  // CHECK: call void @llvm.x86.sse2.mfence()
   _mm_mfence();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_min_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_MIN_I:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_MIN_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_min_epi16
-  // CHECK: call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_min_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_min_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_MIN_I:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_MIN_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_min_epu8
-  // CHECK: call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_min_epu8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_min_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_min_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_min_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_min_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_min_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_min_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_min_sd
-  // CHECK: call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_min_sd(A, B);
 }
 
+//
+// X86-LABEL: define i64 @test_mm_movepi64_pi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+// X86-NEXT:    [[TMP2:%.*]] = bitcast i64 [[VECEXT_I]] to <1 x i64>
+// X86-NEXT:    store <1 x i64> [[TMP2]], ptr [[RETVAL_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[RETVAL_I]], align 8
+// X86-NEXT:    store i64 [[TMP3]], ptr [[COERCE]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr [[COERCE]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP4]], ptr [[RETVAL]], align 8
+// X86-NEXT:    [[TMP5:%.*]] = load i64, ptr [[RETVAL]], align 8
+// X86-NEXT:    ret i64 [[TMP5]]
+//
 __m64 test_mm_movepi64_pi64(__m128i A)
 {
-  // CHECK-LABEL: test_mm_movepi64_pi64
-  // CHECK: [[EXT:%.*]] = extractelement <2 x i64> %1, i32 0
   return _mm_movepi64_pi64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_movpi64_epi64(
+// X86-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    store i64 [[A_COERCE]], ptr [[A]], align 8
+// X86-NEXT:    [[A1:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// X86-NEXT:    store <1 x i64> [[A1]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP0]], ptr [[COERCE]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[COERCE]], align 8
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__A_I]], align 8
+// X86-NEXT:    [[__A1_I:%.*]] = load <1 x i64>, ptr [[__A_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__A1_I]], ptr [[__A_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr [[__A_ADDR_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to i64
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 0, i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_movpi64_epi64(__m64 A)
 {
-  // CHECK-LABEL: test_mm_movpi64_epi64
-  // CHECK: [[CAST:%.*]] = bitcast <1 x i64> %{{.*}} to i64
-  // CHECK: [[INS:%.*]] = insertelement <2 x i64> poison, i64 [[CAST]], i32 0
-  // CHECK: insertelement <2 x i64> [[INS]], i64 0, i32 1
   return _mm_movpi64_epi64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_move_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP2]], <2 x i32> <i32 0, i32 2>
+// X86-NEXT:    ret <2 x i64> [[SHUFFLE_I]]
+//
 __m128i test_mm_move_epi64(__m128i A) {
-  // CHECK-LABEL: test_mm_move_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_move_epi64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_move_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP3]], double [[VECEXT_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_move_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_move_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_move_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_movemask_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
+// X86-NEXT:    [[TMP3:%.*]] = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> [[TMP2]])
+// X86-NEXT:    ret i32 [[TMP3]]
+//
 int test_mm_movemask_epi8(__m128i A) {
-  // CHECK-LABEL: test_mm_movemask_epi8
-  // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
   return _mm_movemask_epi8(A);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_movemask_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> [[TMP1]])
+// X86-NEXT:    ret i32 [[TMP2]]
+//
 int test_mm_movemask_pd(__m128d A) {
-  // CHECK-LABEL: test_mm_movemask_pd
-  // CHECK: call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
   return _mm_movemask_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mul_epu32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = and <2 x i64> [[TMP2]], <i64 4294967295, i64 4294967295>
+// X86-NEXT:    [[TMP7:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 4294967295>
+// X86-NEXT:    [[TMP8:%.*]] = mul <2 x i64> [[TMP6]], [[TMP7]]
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128i test_mm_mul_epu32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_mul_epu32
-  // CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
-  // CHECK: and <2 x i64> %{{.*}}, <i64 4294967295, i64 4294967295>
-  // CHECK: mul <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mul_epu32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mul_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[MUL_I:%.*]] = fmul <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <2 x double> [[MUL_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_mul_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_mul_pd
-  // CHECK: fmul <2 x double> %{{.*}}, %{{.*}}
   return _mm_mul_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mul_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// X86-NEXT:    [[MUL_I:%.*]] = fmul double [[VECEXT1_I]], [[VECEXT_I]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP4]], double [[MUL_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_mul_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_mul_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fmul double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mul_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mulhi_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_mulhi_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_mulhi_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_mulhi_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mulhi_epu16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_mulhi_epu16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_mulhi_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_mulhi_epu16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_mullo_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[MUL_I:%.*]] = mul <8 x i16> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[MUL_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_mullo_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_mullo_epi16
-  // CHECK: mul <8 x i16> %{{.*}}, %{{.*}}
   return _mm_mullo_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_or_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[OR_I:%.*]] = or <2 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[OR_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_or_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_or_pd
-  // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_or_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_or_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[OR_I:%.*]] = or <2 x i64> [[TMP2]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[OR_I]]
+//
 __m128i test_mm_or_si128(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_or_si128
-  // CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_or_si128(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_packs_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_packs_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_packs_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_packs_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_packs_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP3]], <4 x i32> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_packs_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_packs_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_packs_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_packus_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_packus_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_packus_epi16
-  // CHECK: call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_packus_epi16(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_pause(
+// X86-SAME: ) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    call void @llvm.x86.sse2.pause()
+// X86-NEXT:    ret void
+//
 void test_mm_pause(void) {
-  // CHECK-LABEL: test_mm_pause
-  // CHECK: call void @llvm.x86.sse2.pause()
   return _mm_pause();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sad_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[TMP6:%.*]] = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sad_epu8
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_sad_epu8(A, B);
 }
 
+//
 __m128i test_mm_set_epi8(char A, char B, char C, char D,
                          char E, char F, char G, char H,
                          char I, char J, char K, char L,
                          char M, char N, char O, char P) {
-  // CHECK-LABEL: test_mm_set_epi8
-  // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
   return _mm_set_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_epi16(
+// X86-SAME: i16 noundef signext [[A:%.*]], i16 noundef signext [[B:%.*]], i16 noundef signext [[C:%.*]], i16 noundef signext [[D:%.*]], i16 noundef signext [[E:%.*]], i16 noundef signext [[F:%.*]], i16 noundef signext [[G:%.*]], i16 noundef signext [[H:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__W7_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W6_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W5_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W4_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W3_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W2_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W1_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W0_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <8 x i16>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[D_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[E_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[F_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[G_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[H_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// X86-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// X86-NEXT:    store i16 [[C]], ptr [[C_ADDR]], align 2
+// X86-NEXT:    store i16 [[D]], ptr [[D_ADDR]], align 2
+// X86-NEXT:    store i16 [[E]], ptr [[E_ADDR]], align 2
+// X86-NEXT:    store i16 [[F]], ptr [[F_ADDR]], align 2
+// X86-NEXT:    store i16 [[G]], ptr [[G_ADDR]], align 2
+// X86-NEXT:    store i16 [[H]], ptr [[H_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[C_ADDR]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[D_ADDR]], align 2
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[E_ADDR]], align 2
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[F_ADDR]], align 2
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[G_ADDR]], align 2
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[H_ADDR]], align 2
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W7_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W6_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W5_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W4_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W3_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W2_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W1_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W0_ADDR_I]], align 2
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[__W0_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__W1_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__W2_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[__W3_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 [[TMP11]], i32 3
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[__W4_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 [[TMP12]], i32 4
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[__W5_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 [[TMP13]], i32 5
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[__W6_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 [[TMP14]], i32 6
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__W7_ADDR_I]], align 2
+// X86-NEXT:    [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[TMP15]], i32 7
+// X86-NEXT:    store <8 x i16> [[VECINIT7_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP16:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP17:%.*]] = bitcast <8 x i16> [[TMP16]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP17]]
+//
 __m128i test_mm_set_epi16(short A, short B, short C, short D,
                           short E, short F, short G, short H) {
-  // CHECK-LABEL: test_mm_set_epi16
-  // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
   return _mm_set_epi16(A, B, C, D, E, F, G, H);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_epi32(
+// X86-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__I3_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I0_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    store i32 [[C]], ptr [[C_ADDR]], align 4
+// X86-NEXT:    store i32 [[D]], ptr [[D_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[D_ADDR]], align 4
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I3_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I2_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I1_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I0_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__I0_ADDR_I]], align 4
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__I1_ADDR_I]], align 4
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 [[TMP5]], i32 1
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__I2_ADDR_I]], align 4
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 [[TMP6]], i32 2
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__I3_ADDR_I]], align 4
+// X86-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[TMP7]], i32 3
+// X86-NEXT:    store <4 x i32> [[VECINIT3_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <4 x i32>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128i test_mm_set_epi32(int A, int B, int C, int D) {
-  // CHECK-LABEL: test_mm_set_epi32
-  // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
   return _mm_set_epi32(A, B, C, D);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_epi64(
+// X86-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__Q1_ADDR_I4:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__Q0_ADDR_I5:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__Q1_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q1_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[B:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE3:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    store i64 [[A_COERCE]], ptr [[A]], align 8
+// X86-NEXT:    [[A1:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// X86-NEXT:    store i64 [[B_COERCE]], ptr [[B]], align 8
+// X86-NEXT:    [[B2:%.*]] = load <1 x i64>, ptr [[B]], align 8
+// X86-NEXT:    store <1 x i64> [[A1]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[B2]], ptr [[B_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP0]], ptr [[COERCE]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[COERCE]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP1]], ptr [[COERCE3]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[COERCE3]], align 8
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__Q1_I]], align 8
+// X86-NEXT:    [[__Q11_I:%.*]] = load <1 x i64>, ptr [[__Q1_I]], align 8
+// X86-NEXT:    store i64 [[TMP3]], ptr [[__Q0_I]], align 8
+// X86-NEXT:    [[__Q02_I:%.*]] = load <1 x i64>, ptr [[__Q0_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q11_I]], ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q02_I]], ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to i64
+// X86-NEXT:    [[TMP6:%.*]] = load <1 x i64>, ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+// X86-NEXT:    store i64 [[TMP5]], ptr [[__Q1_ADDR_I4]], align 8
+// X86-NEXT:    store i64 [[TMP7]], ptr [[__Q0_ADDR_I5]], align 8
+// X86-NEXT:    [[TMP8:%.*]] = load i64, ptr [[__Q0_ADDR_I5]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load i64, ptr [[__Q1_ADDR_I4]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[TMP9]], i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128i test_mm_set_epi64(__m64 A, __m64 B) {
-  // CHECK-LABEL: test_mm_set_epi64
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   return _mm_set_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_epi64x(
+// X86-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__Q1_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__Q0_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    store i64 [[B]], ptr [[B_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B_ADDR]], align 8
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 [[TMP3]], i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_set_epi64x(long long A, long long B) {
-  // CHECK-LABEL: test_mm_set_epi64x
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   return _mm_set_epi64x(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_pd(
+// X86-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__X_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    store double [[B]], ptr [[B_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[B_ADDR]], align 8
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    store double [[TMP1]], ptr [[__X_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[__X_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP3]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_set_pd(double A, double B) {
-  // CHECK-LABEL: test_mm_set_pd
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_set_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_pd1(
+// X86-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I1:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I2:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    store double [[TMP1]], ptr [[__W_ADDR_I2]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[__W_ADDR_I2]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__W_ADDR_I2]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP3]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I1]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I1]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_set_pd1(double A) {
-  // CHECK-LABEL: test_mm_set_pd1
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_set_pd1(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set_sd(
+// X86-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double 0.000000e+00, i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_set_sd(double A) {
-  // CHECK-LABEL: test_mm_set_sd
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
   return _mm_set_sd(A);
 }
 
+//
 __m128i test_mm_set1_epi8(char A) {
-  // CHECK-LABEL: test_mm_set1_epi8
-  // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
   return _mm_set1_epi8(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set1_epi16(
+// X86-SAME: i16 noundef signext [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__W7_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W6_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W5_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W4_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W3_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W1_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W0_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i16>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[__W_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W7_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W6_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W5_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W4_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W3_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W2_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W1_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP8]], ptr [[__W0_ADDR_I_I]], align 2
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__W0_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP9]], i32 0
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__W1_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT_I_I]], i16 [[TMP10]], i32 1
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[__W2_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[TMP11]], i32 2
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[__W3_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[TMP12]], i32 3
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[__W4_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I_I]], i16 [[TMP13]], i32 4
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[__W5_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I_I]], i16 [[TMP14]], i32 5
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__W6_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I_I]], i16 [[TMP15]], i32 6
+// X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[__W7_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I_I]], i16 [[TMP16]], i32 7
+// X86-NEXT:    store <8 x i16> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP17:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP18:%.*]] = bitcast <8 x i16> [[TMP17]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP18]]
+//
 __m128i test_mm_set1_epi16(short A) {
-  // CHECK-LABEL: test_mm_set1_epi16
-  // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
   return _mm_set1_epi16(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set1_epi32(
+// X86-SAME: i32 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__I3_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I0_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[__I_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__I_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I3_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I2_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I1_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP4]], ptr [[__I0_ADDR_I_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__I0_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__I1_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i32> [[VECINIT_I_I]], i32 [[TMP6]], i32 1
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__I2_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I_I]], i32 [[TMP7]], i32 2
+// X86-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__I3_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I_I]], i32 [[TMP8]], i32 3
+// X86-NEXT:    store <4 x i32> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <4 x i32>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP10]]
+//
 __m128i test_mm_set1_epi32(int A) {
-  // CHECK-LABEL: test_mm_set1_epi32
-  // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
   return _mm_set1_epi32(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set1_epi64(
+// X86-SAME: i64 noundef [[A_COERCE:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__Q1_ADDR_I3_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__Q0_ADDR_I4_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__Q1_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q1_ADDR_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_ADDR_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE2_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    store i64 [[A_COERCE]], ptr [[A]], align 8
+// X86-NEXT:    [[A1:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// X86-NEXT:    store <1 x i64> [[A1]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP0]], ptr [[COERCE]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[COERCE]], align 8
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__Q_I]], align 8
+// X86-NEXT:    [[__Q1_I:%.*]] = load <1 x i64>, ptr [[__Q_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q1_I]], ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load <1 x i64>, ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load <1 x i64>, ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP2]], ptr [[COERCE_I]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load i64, ptr [[COERCE_I]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP3]], ptr [[COERCE2_I]], align 8
+// X86-NEXT:    [[TMP5:%.*]] = load i64, ptr [[COERCE2_I]], align 8
+// X86-NEXT:    store i64 [[TMP4]], ptr [[__Q1_I_I]], align 8
+// X86-NEXT:    [[__Q11_I_I:%.*]] = load <1 x i64>, ptr [[__Q1_I_I]], align 8
+// X86-NEXT:    store i64 [[TMP5]], ptr [[__Q0_I_I]], align 8
+// X86-NEXT:    [[__Q02_I_I:%.*]] = load <1 x i64>, ptr [[__Q0_I_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q11_I_I]], ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q02_I_I]], ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP6:%.*]] = load <1 x i64>, ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to i64
+// X86-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+// X86-NEXT:    store i64 [[TMP7]], ptr [[__Q1_ADDR_I3_I]], align 8
+// X86-NEXT:    store i64 [[TMP9]], ptr [[__Q0_ADDR_I4_I]], align 8
+// X86-NEXT:    [[TMP10:%.*]] = load i64, ptr [[__Q0_ADDR_I4_I]], align 8
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
+// X86-NEXT:    [[TMP11:%.*]] = load i64, ptr [[__Q1_ADDR_I3_I]], align 8
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i64> [[VECINIT_I_I]], i64 [[TMP11]], i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP12:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP12]]
+//
 __m128i test_mm_set1_epi64(__m64 A) {
-  // CHECK-LABEL: test_mm_set1_epi64
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   return _mm_set1_epi64(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set1_epi64x(
+// X86-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__Q1_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__Q0_ADDR_I_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__Q_ADDR_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
+// X86-NEXT:    store i64 [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store i64 [[TMP0]], ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[__Q_ADDR_I]], align 8
+// X86-NEXT:    store i64 [[TMP1]], ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load i64, ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i64> [[VECINIT_I_I]], i64 [[TMP4]], i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_set1_epi64x(long long A) {
-  // CHECK-LABEL: test_mm_set1_epi64x
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   return _mm_set1_epi64x(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_set1_pd(
+// X86-SAME: double noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP2]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP3]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_set1_pd(double A) {
-  // CHECK-LABEL: test_mm_set1_pd
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_set1_pd(A);
 }
 
+//
 __m128i test_mm_setr_epi8(char A, char B, char C, char D,
                           char E, char F, char G, char H,
                           char I, char J, char K, char L,
                           char M, char N, char O, char P) {
-  // CHECK-LABEL: test_mm_setr_epi8
-  // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
-  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
   return _mm_setr_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setr_epi16(
+// X86-SAME: i16 noundef signext [[A:%.*]], i16 noundef signext [[B:%.*]], i16 noundef signext [[C:%.*]], i16 noundef signext [[D:%.*]], i16 noundef signext [[E:%.*]], i16 noundef signext [[F:%.*]], i16 noundef signext [[G:%.*]], i16 noundef signext [[H:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__W7_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W6_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W5_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W4_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W3_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W2_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W1_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W0_ADDR_I_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <8 x i16>, align 16
+// X86-NEXT:    [[__W0_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W1_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W2_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W3_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W4_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W5_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W6_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[__W7_ADDR_I:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[D_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[E_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[F_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[G_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    [[H_ADDR:%.*]] = alloca i16, align 2
+// X86-NEXT:    store i16 [[A]], ptr [[A_ADDR]], align 2
+// X86-NEXT:    store i16 [[B]], ptr [[B_ADDR]], align 2
+// X86-NEXT:    store i16 [[C]], ptr [[C_ADDR]], align 2
+// X86-NEXT:    store i16 [[D]], ptr [[D_ADDR]], align 2
+// X86-NEXT:    store i16 [[E]], ptr [[E_ADDR]], align 2
+// X86-NEXT:    store i16 [[F]], ptr [[F_ADDR]], align 2
+// X86-NEXT:    store i16 [[G]], ptr [[G_ADDR]], align 2
+// X86-NEXT:    store i16 [[H]], ptr [[H_ADDR]], align 2
+// X86-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2
+// X86-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B_ADDR]], align 2
+// X86-NEXT:    [[TMP2:%.*]] = load i16, ptr [[C_ADDR]], align 2
+// X86-NEXT:    [[TMP3:%.*]] = load i16, ptr [[D_ADDR]], align 2
+// X86-NEXT:    [[TMP4:%.*]] = load i16, ptr [[E_ADDR]], align 2
+// X86-NEXT:    [[TMP5:%.*]] = load i16, ptr [[F_ADDR]], align 2
+// X86-NEXT:    [[TMP6:%.*]] = load i16, ptr [[G_ADDR]], align 2
+// X86-NEXT:    [[TMP7:%.*]] = load i16, ptr [[H_ADDR]], align 2
+// X86-NEXT:    store i16 [[TMP0]], ptr [[__W0_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP1]], ptr [[__W1_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP2]], ptr [[__W2_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP3]], ptr [[__W3_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP4]], ptr [[__W4_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP5]], ptr [[__W5_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP6]], ptr [[__W6_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP7]], ptr [[__W7_ADDR_I]], align 2
+// X86-NEXT:    [[TMP8:%.*]] = load i16, ptr [[__W7_ADDR_I]], align 2
+// X86-NEXT:    [[TMP9:%.*]] = load i16, ptr [[__W6_ADDR_I]], align 2
+// X86-NEXT:    [[TMP10:%.*]] = load i16, ptr [[__W5_ADDR_I]], align 2
+// X86-NEXT:    [[TMP11:%.*]] = load i16, ptr [[__W4_ADDR_I]], align 2
+// X86-NEXT:    [[TMP12:%.*]] = load i16, ptr [[__W3_ADDR_I]], align 2
+// X86-NEXT:    [[TMP13:%.*]] = load i16, ptr [[__W2_ADDR_I]], align 2
+// X86-NEXT:    [[TMP14:%.*]] = load i16, ptr [[__W1_ADDR_I]], align 2
+// X86-NEXT:    [[TMP15:%.*]] = load i16, ptr [[__W0_ADDR_I]], align 2
+// X86-NEXT:    store i16 [[TMP8]], ptr [[__W7_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP9]], ptr [[__W6_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP10]], ptr [[__W5_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP11]], ptr [[__W4_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP12]], ptr [[__W3_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP13]], ptr [[__W2_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP14]], ptr [[__W1_ADDR_I_I]], align 2
+// X86-NEXT:    store i16 [[TMP15]], ptr [[__W0_ADDR_I_I]], align 2
+// X86-NEXT:    [[TMP16:%.*]] = load i16, ptr [[__W0_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <8 x i16> poison, i16 [[TMP16]], i32 0
+// X86-NEXT:    [[TMP17:%.*]] = load i16, ptr [[__W1_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT_I_I]], i16 [[TMP17]], i32 1
+// X86-NEXT:    [[TMP18:%.*]] = load i16, ptr [[__W2_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[TMP18]], i32 2
+// X86-NEXT:    [[TMP19:%.*]] = load i16, ptr [[__W3_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[TMP19]], i32 3
+// X86-NEXT:    [[TMP20:%.*]] = load i16, ptr [[__W4_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT4_I_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I_I]], i16 [[TMP20]], i32 4
+// X86-NEXT:    [[TMP21:%.*]] = load i16, ptr [[__W5_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT5_I_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I_I]], i16 [[TMP21]], i32 5
+// X86-NEXT:    [[TMP22:%.*]] = load i16, ptr [[__W6_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT6_I_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I_I]], i16 [[TMP22]], i32 6
+// X86-NEXT:    [[TMP23:%.*]] = load i16, ptr [[__W7_ADDR_I_I]], align 2
+// X86-NEXT:    [[VECINIT7_I_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I_I]], i16 [[TMP23]], i32 7
+// X86-NEXT:    store <8 x i16> [[VECINIT7_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP24:%.*]] = load <8 x i16>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP25:%.*]] = bitcast <8 x i16> [[TMP24]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP25]]
+//
 __m128i test_mm_setr_epi16(short A, short B, short C, short D,
                            short E, short F, short G, short H) {
-  // CHECK-LABEL: test_mm_setr_epi16
-  // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
-  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
   return _mm_setr_epi16(A, B, C, D, E, F, G, H);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setr_epi32(
+// X86-SAME: i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__I3_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I0_ADDR_I_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <4 x i32>, align 16
+// X86-NEXT:    [[__I0_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I1_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I2_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[__I3_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store i32 [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    store i32 [[C]], ptr [[C_ADDR]], align 4
+// X86-NEXT:    store i32 [[D]], ptr [[D_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[C_ADDR]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[D_ADDR]], align 4
+// X86-NEXT:    store i32 [[TMP0]], ptr [[__I0_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__I1_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP2]], ptr [[__I2_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP3]], ptr [[__I3_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__I3_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load i32, ptr [[__I2_ADDR_I]], align 4
+// X86-NEXT:    [[TMP6:%.*]] = load i32, ptr [[__I1_ADDR_I]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load i32, ptr [[__I0_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP4]], ptr [[__I3_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP5]], ptr [[__I2_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP6]], ptr [[__I1_ADDR_I_I]], align 4
+// X86-NEXT:    store i32 [[TMP7]], ptr [[__I0_ADDR_I_I]], align 4
+// X86-NEXT:    [[TMP8:%.*]] = load i32, ptr [[__I0_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
+// X86-NEXT:    [[TMP9:%.*]] = load i32, ptr [[__I1_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <4 x i32> [[VECINIT_I_I]], i32 [[TMP9]], i32 1
+// X86-NEXT:    [[TMP10:%.*]] = load i32, ptr [[__I2_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT2_I_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I_I]], i32 [[TMP10]], i32 2
+// X86-NEXT:    [[TMP11:%.*]] = load i32, ptr [[__I3_ADDR_I_I]], align 4
+// X86-NEXT:    [[VECINIT3_I_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I_I]], i32 [[TMP11]], i32 3
+// X86-NEXT:    store <4 x i32> [[VECINIT3_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP12:%.*]] = load <4 x i32>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP13:%.*]] = bitcast <4 x i32> [[TMP12]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP13]]
+//
 __m128i test_mm_setr_epi32(int A, int B, int C, int D) {
-  // CHECK-LABEL: test_mm_setr_epi32
-  // CHECK: insertelement <4 x i32> poison, i32 %{{.*}}, i32 0
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
-  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
   return _mm_setr_epi32(A, B, C, D);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setr_epi64(
+// X86-SAME: i64 noundef [[A_COERCE:%.*]], i64 noundef [[B_COERCE:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__Q1_ADDR_I4_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[__Q0_ADDR_I5_I:%.*]] = alloca i64, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__Q1_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q1_ADDR_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_ADDR_I_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q1_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q0_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[__Q1_ADDR_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE3_I:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[B:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    [[COERCE3:%.*]] = alloca <1 x i64>, align 8
+// X86-NEXT:    store i64 [[A_COERCE]], ptr [[A]], align 8
+// X86-NEXT:    [[A1:%.*]] = load <1 x i64>, ptr [[A]], align 8
+// X86-NEXT:    store i64 [[B_COERCE]], ptr [[B]], align 8
+// X86-NEXT:    [[B2:%.*]] = load <1 x i64>, ptr [[B]], align 8
+// X86-NEXT:    store <1 x i64> [[A1]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[B2]], ptr [[B_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP0]], ptr [[COERCE]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load i64, ptr [[COERCE]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP1]], ptr [[COERCE3]], align 8
+// X86-NEXT:    [[TMP3:%.*]] = load i64, ptr [[COERCE3]], align 8
+// X86-NEXT:    store i64 [[TMP2]], ptr [[__Q0_I]], align 8
+// X86-NEXT:    [[__Q01_I:%.*]] = load <1 x i64>, ptr [[__Q0_I]], align 8
+// X86-NEXT:    store i64 [[TMP3]], ptr [[__Q1_I]], align 8
+// X86-NEXT:    [[__Q12_I:%.*]] = load <1 x i64>, ptr [[__Q1_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q01_I]], ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q12_I]], ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    [[TMP4:%.*]] = load <1 x i64>, ptr [[__Q1_ADDR_I]], align 8
+// X86-NEXT:    [[TMP5:%.*]] = load <1 x i64>, ptr [[__Q0_ADDR_I]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP4]], ptr [[COERCE_I]], align 8
+// X86-NEXT:    [[TMP6:%.*]] = load i64, ptr [[COERCE_I]], align 8
+// X86-NEXT:    store <1 x i64> [[TMP5]], ptr [[COERCE3_I]], align 8
+// X86-NEXT:    [[TMP7:%.*]] = load i64, ptr [[COERCE3_I]], align 8
+// X86-NEXT:    store i64 [[TMP6]], ptr [[__Q1_I_I]], align 8
+// X86-NEXT:    [[__Q11_I_I:%.*]] = load <1 x i64>, ptr [[__Q1_I_I]], align 8
+// X86-NEXT:    store i64 [[TMP7]], ptr [[__Q0_I_I]], align 8
+// X86-NEXT:    [[__Q02_I_I:%.*]] = load <1 x i64>, ptr [[__Q0_I_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q11_I_I]], ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    store <1 x i64> [[__Q02_I_I]], ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP8:%.*]] = load <1 x i64>, ptr [[__Q1_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to i64
+// X86-NEXT:    [[TMP10:%.*]] = load <1 x i64>, ptr [[__Q0_ADDR_I_I]], align 8
+// X86-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to i64
+// X86-NEXT:    store i64 [[TMP9]], ptr [[__Q1_ADDR_I4_I]], align 8
+// X86-NEXT:    store i64 [[TMP11]], ptr [[__Q0_ADDR_I5_I]], align 8
+// X86-NEXT:    [[TMP12:%.*]] = load i64, ptr [[__Q0_ADDR_I5_I]], align 8
+// X86-NEXT:    [[VECINIT_I_I:%.*]] = insertelement <2 x i64> poison, i64 [[TMP12]], i32 0
+// X86-NEXT:    [[TMP13:%.*]] = load i64, ptr [[__Q1_ADDR_I4_I]], align 8
+// X86-NEXT:    [[VECINIT1_I_I:%.*]] = insertelement <2 x i64> [[VECINIT_I_I]], i64 [[TMP13]], i32 1
+// X86-NEXT:    store <2 x i64> [[VECINIT1_I_I]], ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    [[TMP14:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP14]]
+//
 __m128i test_mm_setr_epi64(__m64 A, __m64 B) {
-  // CHECK-LABEL: test_mm_setr_epi64
-  // CHECK: insertelement <2 x i64> poison, i64 %{{.*}}, i32 0
-  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
   return _mm_setr_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setr_pd(
+// X86-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__W_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[__X_ADDR_I:%.*]] = alloca double, align 8
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca double, align 8
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store double [[A]], ptr [[A_ADDR]], align 8
+// X86-NEXT:    store double [[B]], ptr [[B_ADDR]], align 8
+// X86-NEXT:    [[TMP0:%.*]] = load double, ptr [[A_ADDR]], align 8
+// X86-NEXT:    [[TMP1:%.*]] = load double, ptr [[B_ADDR]], align 8
+// X86-NEXT:    store double [[TMP0]], ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    store double [[TMP1]], ptr [[__X_ADDR_I]], align 8
+// X86-NEXT:    [[TMP2:%.*]] = load double, ptr [[__W_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load double, ptr [[__X_ADDR_I]], align 8
+// X86-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[TMP3]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT1_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP5]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128d test_mm_setr_pd(double A, double B) {
-  // CHECK-LABEL: test_mm_setr_pd
-  // CHECK: insertelement <2 x double> poison, double %{{.*}}, i32 0
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_setr_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setzero_pd(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128d test_mm_setzero_pd(void) {
-  // CHECK-LABEL: test_mm_setzero_pd
-  // CHECK: store <2 x double> zeroinitializer
   return _mm_setzero_pd();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_setzero_si128(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP0]]
+//
 __m128i test_mm_setzero_si128(void) {
-  // CHECK-LABEL: test_mm_setzero_si128
-  // CHECK: store <2 x i64> zeroinitializer
   return _mm_setzero_si128();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_shuffle_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+// X86-NEXT:    [[PERMIL:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <4 x i32> [[PERMIL]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_shuffle_epi32(__m128i A) {
-  // CHECK-LABEL: test_mm_shuffle_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> zeroinitializer
   return _mm_shuffle_epi32(A, 0);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_shuffle_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[SHUFP:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 2>
+// X86-NEXT:    store <2 x double> [[SHUFP]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_shuffle_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
   return _mm_shuffle_pd(A, B, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_shufflehi_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// X86-NEXT:    [[PSHUFHW:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[PSHUFHW]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_shufflehi_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_shufflehi_epi16
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
   return _mm_shufflehi_epi16(A, 0);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_shufflelo_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <8 x i16>
+// X86-NEXT:    [[PSHUFLW:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <8 x i16> [[PSHUFLW]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP2]]
+//
 __m128i test_mm_shufflelo_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_shufflelo_epi16
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   return _mm_shufflelo_epi16(A, 0);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sll_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sll_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sll_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sll_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> [[TMP3]], <4 x i32> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sll_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sll_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sll_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]])
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sll_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sll_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_slli_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi16(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi16_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_slli_epi16_1(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi16_1
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi16(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi16_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_slli_epi16_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_slli_epi16_2
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_slli_epi32(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi32(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi32_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_slli_epi32_1(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi32_1
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi32(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi32_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_slli_epi32_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_slli_epi32_2
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> [[TMP1]], i32 [[TMP2]])
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_slli_epi64(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi64(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi64_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> [[TMP1]], i32 [[TMP2]])
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_slli_epi64_1(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_epi64_1
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi64(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_epi64_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_slli_epi64_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_slli_epi64_2
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// X86-NEXT:    [[PSLLDQ:%.*]] = shufflevector <16 x i8> zeroinitializer, <16 x i8> [[CAST]], <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+// X86-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSLLDQ]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_slli_si128(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_si128
-  // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
   return _mm_slli_si128(A, 5);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_slli_si128_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    ret <2 x i64> zeroinitializer
+//
 __m128i test_mm_slli_si128_2(__m128i A) {
-  // CHECK-LABEL: test_mm_slli_si128_2
-  // CHECK: ret <2 x i64> zeroinitializer
   return _mm_slli_si128(A, 17);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sqrt_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP3]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128d test_mm_sqrt_pd(__m128d A) {
-  // CHECK-LABEL: test_mm_sqrt_pd
-  // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
   return _mm_sqrt_pd(A);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sqrt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__C_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[DOTCOMPOUNDLITERAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 0
+// X86-NEXT:    [[TMP4:%.*]] = call double @llvm.sqrt.f64(double [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i64 0
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[__C_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__C_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP6]], i32 0
+// X86-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> poison, double [[VECEXT_I]], i32 0
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
+// X86-NEXT:    [[VECINIT2_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[VECEXT1_I]], i32 1
+// X86-NEXT:    store <2 x double> [[VECINIT2_I]], ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[DOTCOMPOUNDLITERAL_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP9]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP10:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP10]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP11:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP11]]
+//
 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_sqrt_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
-  // CHECK: call double @llvm.sqrt.f64(double {{.*}})
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i64 0
   return _mm_sqrt_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sra_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sra_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sra_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sra_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> [[TMP3]], <4 x i32> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sra_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sra_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srai_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_srai_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi16(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi16_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srai_epi16_1(__m128i A) {
-  // CHECK-LABEL: test_mm_srai_epi16_1
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi16(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi16_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_srai_epi16_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_srai_epi16_2
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srai_epi32(__m128i A) {
-  // CHECK-LABEL: test_mm_srai_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi32(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi32_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srai_epi32_1(__m128i A) {
-  // CHECK-LABEL: test_mm_srai_epi32_1
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi32(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srai_epi32_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_srai_epi32_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_srai_epi32_2
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srl_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[TMP6:%.*]] = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_srl_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_srl_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srl_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> [[TMP3]], <4 x i32> [[TMP5]])
+// X86-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP7]]
+//
 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_srl_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srl_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srl_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__COUNT_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> [[TMP2]], <2 x i64> [[TMP3]])
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_srl_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srl_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srli_epi16(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi16(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi16_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srli_epi16_1(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi16_1
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi16(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi16_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_srli_epi16_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_srli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srli_epi32(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi32(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi32_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP5]]
+//
 __m128i test_mm_srli_epi32_1(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi32_1
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi32(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi32_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> [[TMP3]], i32 [[TMP4]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_srli_epi32_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_srli_epi32_2
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> [[TMP1]], i32 [[TMP2]])
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_srli_epi64(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi64(A, 1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi64_1(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 -1, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> [[TMP1]], i32 [[TMP2]])
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128i test_mm_srli_epi64_1(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_epi64_1
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi64(A, -1);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_epi64_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__COUNT_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__COUNT_ADDR_I]], align 4
+// X86-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> [[TMP2]], i32 [[TMP3]])
+// X86-NEXT:    ret <2 x i64> [[TMP4]]
+//
 __m128i test_mm_srli_epi64_2(__m128i A, int B) {
-  // CHECK-LABEL: test_mm_srli_epi64_2
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[CAST:%.*]] = bitcast <2 x i64> [[TMP0]] to <16 x i8>
+// X86-NEXT:    [[PSRLDQ:%.*]] = shufflevector <16 x i8> [[CAST]], <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+// X86-NEXT:    [[CAST1:%.*]] = bitcast <16 x i8> [[PSRLDQ]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[CAST1]]
+//
 __m128i test_mm_srli_si128(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_si128
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> zeroinitializer, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
   return _mm_srli_si128(A, 5);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_srli_si128_2(
+// X86-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    ret <2 x i64> zeroinitializer
+//
 __m128i test_mm_srli_si128_2(__m128i A) {
-  // CHECK-LABEL: test_mm_srli_si128_2
   // ret <2 x i64> zeroinitializer
   return _mm_srli_si128(A, 17);
 }
 
+//
+// X86-LABEL: define void @test_mm_store_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP3]], align 16
+// X86-NEXT:    ret void
+//
 void test_mm_store_pd(double* A, __m128d B) {
-  // CHECK-LABEL: test_mm_store_pd
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_store_pd1(
+// X86-SAME: ptr noundef [[X:%.*]], <2 x double> noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I1:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I2:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[Y]], ptr [[Y_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[Y_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store ptr [[TMP2]], ptr [[__DP_ADDR_I1]], align 4
+// X86-NEXT:    store <2 x double> [[TMP3]], ptr [[__A_ADDR_I2]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I2]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I2]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[__A_ADDR_I2]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[__DP_ADDR_I1]], align 4
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[__A_ADDR_I2]], align 16
+// X86-NEXT:    store ptr [[TMP6]], ptr [[__DP_ADDR_I_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[__DP_ADDR_I_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[TMP9]], align 16
+// X86-NEXT:    ret void
+//
 void test_mm_store_pd1(double* x, __m128d y) {
-  // CHECK-LABEL: test_mm_store_pd1
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
-  // CHECK: store <2 x double> %{{.*}}, ptr {{.*}}, align 16
   _mm_store_pd1(x, y);
 }
 
+//
+// X86-LABEL: define void @test_mm_store_sd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store double [[VECEXT_I]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_store_sd(double* A, __m128d B) {
-  // CHECK-LABEL: test_mm_store_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: store double %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_store_sd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_store_si128(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[TMP3]], align 16
+// X86-NEXT:    ret void
+//
 void test_mm_store_si128(__m128i* A, __m128i B) {
-  // CHECK-LABEL: test_mm_store_si128
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store_si128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_store1_pd(
+// X86-SAME: ptr noundef [[X:%.*]], <2 x double> noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[X]], ptr [[X_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[Y]], ptr [[Y_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[X_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[Y_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store ptr [[TMP4]], ptr [[__DP_ADDR_I_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x double>, ptr [[__A_ADDR_I_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[__DP_ADDR_I_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[TMP7]], align 16
+// X86-NEXT:    ret void
+//
 void test_mm_store1_pd(double* x, __m128d y) {
-  // CHECK-LABEL: test_mm_store1_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16
   _mm_store1_pd(x, y);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeh_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store double [[VECEXT_I]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeh_pd(double* A, __m128d B) {
-  // CHECK-LABEL: test_mm_storeh_pd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
-  // CHECK: store double %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_storeh_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storel_epi64(
+// X86-SAME: <2 x i64> noundef [[X:%.*]], ptr noundef [[Y:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[X_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[Y_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store <2 x i64> [[X]], ptr [[X_ADDR]], align 16
+// X86-NEXT:    store ptr [[Y]], ptr [[Y_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[Y_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i64 [[VECEXT_I]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storel_epi64(__m128i x, void* y) {
-  // CHECK-LABEL: test_mm_storel_epi64
-  // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
-  // CHECK: store {{.*}} ptr {{.*}}, align 1{{$}}
   _mm_storel_epi64(y, x);
 }
 
+//
+// X86-LABEL: define void @test_mm_storel_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store double [[VECEXT_I]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storel_pd(double* A, __m128d B) {
-  // CHECK-LABEL: test_mm_storel_pd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: store double %{{.*}}, ptr %{{.*}}, align 1{{$}}
   _mm_storel_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storer_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store ptr [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 0>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP4]], ptr [[TMP5]], align 16
+// X86-NEXT:    ret void
+//
 void test_mm_storer_pd(__m128d A, double* B) {
-  // CHECK-LABEL: test_mm_storer_pd
-  // CHECK: shufflevector <2 x double> {{.*}}, <2 x double> {{.*}}, <2 x i32> <i32 1, i32 0>
-  // CHECK: store {{.*}} ptr {{.*}}, align 16{{$}}
   _mm_storer_pd(B, A);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeu_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__DP_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__DP_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeu_pd(double* A, __m128d B) {
-  // CHECK-LABEL: test_mm_storeu_pd
-  // CHECK: store {{.*}} ptr {{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeu_si128(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeu_si128(__m128i* A, __m128i B) {
-  // CHECK-LABEL: test_mm_storeu_si128
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_si128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeu_si64(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i64 [[VECEXT_I]], ptr [[TMP3]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeu_si64(void* A, __m128i B) {
-  // CHECK-LABEL: test_mm_storeu_si64
-  // CHECK: [[EXT:%.*]] = extractelement <2 x i64> %{{.*}}, i32 0
-  // CHECK: store i64 [[EXT]], ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_si64(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeu_si32(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[VECEXT_I]], ptr [[TMP4]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeu_si32(void* A, __m128i B) {
-  // CHECK-LABEL: test_mm_storeu_si32
-  // CHECK: [[EXT:%.*]] = extractelement <4 x i32> %{{.*}}, i32 0
-  // CHECK: store i32 [[EXT]], ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_si32(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_storeu_si16(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0
+// X86-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i16 [[VECEXT_I]], ptr [[TMP4]], align 1
+// X86-NEXT:    ret void
+//
 void test_mm_storeu_si16(void* A, __m128i B) {
-  // CHECK-LABEL: test_mm_storeu_si16
-  // CHECK: [[EXT:%.*]] = extractelement <8 x i16> %{{.*}}, i32 0
-  // CHECK: store i16 [[EXT]], ptr %{{.*}}, align 1{{$}}
-  // CHECK-NEXT: ret void
   _mm_storeu_si16(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_stream_pd(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP3]], align 16, !nontemporal [[META3:![0-9]+]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_pd(double *A, __m128d B) {
-  // CHECK-LABEL: test_mm_stream_pd
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_stream_pd_void(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[TMP3]], align 16, !nontemporal [[META3]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_pd_void(void *A, __m128d B) {
-  // CHECK-LABEL: test_mm_stream_pd_void
-  // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_pd(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_stream_si32(
+// X86-SAME: ptr noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP3]], ptr [[TMP2]], align 1, !nontemporal [[META3]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_si32(int *A, int B) {
-  // CHECK-LABEL: test_mm_stream_si32
-  // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si32(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_stream_si32_void(
+// X86-SAME: ptr noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR1]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca i32, align 4
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store i32 [[B]], ptr [[B_ADDR]], align 4
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B_ADDR]], align 4
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP1]], ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    [[TMP3:%.*]] = load i32, ptr [[__A_ADDR_I]], align 4
+// X86-NEXT:    store i32 [[TMP3]], ptr [[TMP2]], align 1, !nontemporal [[META3]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_si32_void(void *A, int B) {
-  // CHECK-LABEL: test_mm_stream_si32_void
-  // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si32(A, B);
 }
 
 #ifdef __x86_64__
+//
 void test_mm_stream_si64(long long *A, long long B) {
-  // X64-LABEL: test_mm_stream_si64
-  // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si64(A, B);
 }
 
+//
 void test_mm_stream_si64_void(void *A, long long B) {
-  // X64-LABEL: test_mm_stream_si64_void
-  // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
   _mm_stream_si64(A, B);
 }
 #endif
 
+//
+// X86-LABEL: define void @test_mm_stream_si128(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[TMP3]], align 16, !nontemporal [[META3]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_si128(__m128i *A, __m128i B) {
-  // CHECK-LABEL: test_mm_stream_si128
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_si128(A, B);
 }
 
+//
+// X86-LABEL: define void @test_mm_stream_si128_void(
+// X86-SAME: ptr noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__P_ADDR_I:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca ptr, align 4
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store ptr [[A]], ptr [[A_ADDR]], align 4
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 4
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store ptr [[TMP0]], ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[__P_ADDR_I]], align 4
+// X86-NEXT:    store <2 x i64> [[TMP2]], ptr [[TMP3]], align 16, !nontemporal [[META3]]
+// X86-NEXT:    ret void
+//
 void test_mm_stream_si128_void(void *A, __m128i B) {
-  // CHECK-LABEL: test_mm_stream_si128_void
-  // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
   _mm_stream_si128(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[SUB_I:%.*]] = sub <16 x i8> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[SUB_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_sub_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sub_epi8
-  // CHECK: sub <16 x i8>
   return _mm_sub_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[SUB_I:%.*]] = sub <8 x i16> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[SUB_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_sub_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sub_epi16
-  // CHECK: sub <8 x i16>
   return _mm_sub_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[SUB_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_sub_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sub_epi32
-  // CHECK: sub <4 x i32>
   return _mm_sub_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[SUB_I]]
+//
 __m128i test_mm_sub_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_sub_epi64
-  // CHECK: sub <2 x i64>
   return _mm_sub_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SUB_I:%.*]] = fsub <2 x double> [[TMP2]], [[TMP3]]
+// X86-NEXT:    store <2 x double> [[SUB_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_sub_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_sub_pd
-  // CHECK: fsub <2 x double>
   return _mm_sub_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_sub_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT_I:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECEXT1_I:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// X86-NEXT:    [[SUB_I:%.*]] = fsub double [[VECEXT1_I]], [[VECEXT_I]]
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[VECINS_I:%.*]] = insertelement <2 x double> [[TMP4]], double [[SUB_I]], i32 0
+// X86-NEXT:    store <2 x double> [[VECINS_I]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP6]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP7]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP8]]
+//
 __m128d test_mm_sub_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_sub_sd
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
-  // CHECK: fsub double
-  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_sub_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_subs_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_subs_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_subs_epi8
-  // CHECK: call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_subs_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_subs_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_subs_epi16
-  // CHECK: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_subs_epu8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_subs_epu8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_subs_epu8
-  // CHECK-NOT: call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
-  // CHECK: call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epu8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_subs_epu16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[ELT_SAT_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP5]])
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[ELT_SAT_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_subs_epu16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_subs_epu16
-  // CHECK-NOT: call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
-  // CHECK: call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epu16(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomieq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomieq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomieq_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomige_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomige_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomige_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomige_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomigt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomigt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomigt_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomile_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomile_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomile_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomile_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomilt_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomilt_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomilt_sd(A, B);
 }
 
+//
+// X86-LABEL: define i32 @test_mm_ucomineq_sd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> [[TMP2]], <2 x double> [[TMP3]])
+// X86-NEXT:    ret i32 [[TMP4]]
+//
 int test_mm_ucomineq_sd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_ucomineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomineq_sd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_undefined_pd(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[TMP0:%.*]] = freeze <2 x double> undef
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP2]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP3]]
+//
 __m128d test_mm_undefined_pd(void) {
-  // X64-LABEL: test_mm_undefined_pd
-  // X64: ret <2 x double> zeroinitializer
   //
-  // X86-LABEL: test_mm_undefined_pd
-  // X86: store <2 x double> zeroinitializer
   return _mm_undefined_pd();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_undefined_si128(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[TMP0:%.*]] = freeze <2 x double> undef
+// X86-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[TMP0]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP1]]
+//
 __m128i test_mm_undefined_si128(void) {
-  // CHECK-LABEL: test_mm_undefined_si128
-  // CHECK: ret <2 x i64> zeroinitializer
   return _mm_undefined_si128();
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpackhi_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpackhi_epi8
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
   return _mm_unpackhi_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpackhi_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpackhi_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpackhi_epi16
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   return _mm_unpackhi_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpackhi_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpackhi_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpackhi_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   return _mm_unpackhi_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpackhi_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// X86-NEXT:    ret <2 x i64> [[SHUFFLE_I]]
+//
 __m128i test_mm_unpackhi_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpackhi_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpackhi_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 3>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_unpackhi_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
   return _mm_unpackhi_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpacklo_epi8(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <16 x i8>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> [[TMP5]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <16 x i8> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpacklo_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpacklo_epi8
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   return _mm_unpacklo_epi8(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpacklo_epi16(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <8 x i16>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpacklo_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpacklo_epi16
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
   return _mm_unpacklo_epi16(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpacklo_epi32(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <4 x i32>
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[SHUFFLE_I]] to <2 x i64>
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128i test_mm_unpacklo_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpacklo_epi32
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   return _mm_unpacklo_epi32(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpacklo_epi64(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// X86-NEXT:    ret <2 x i64> [[SHUFFLE_I]]
+//
 __m128i test_mm_unpacklo_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_unpacklo_epi64
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_epi64(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_unpacklo_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 0, i32 2>
+// X86-NEXT:    store <2 x double> [[SHUFFLE_I]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP4]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP5]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP6:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP6]]
+//
 __m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_unpacklo_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
   return _mm_unpacklo_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_xor_pd(
+// X86-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[RETVAL_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[RETVAL:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    [[COERCE:%.*]] = alloca <2 x double>, align 16
+// X86-NEXT:    store <2 x double> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x double> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x double> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x double>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <2 x i64>
+// X86-NEXT:    [[TMP4:%.*]] = load <2 x double>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <2 x i64>
+// X86-NEXT:    [[XOR_I:%.*]] = xor <2 x i64> [[TMP3]], [[TMP5]]
+// X86-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[XOR_I]] to <2 x double>
+// X86-NEXT:    store <2 x double> [[TMP6]], ptr [[RETVAL_I]], align 16
+// X86-NEXT:    [[TMP7:%.*]] = load <2 x i64>, ptr [[RETVAL_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP7]], ptr [[COERCE]], align 16
+// X86-NEXT:    [[TMP8:%.*]] = load <2 x double>, ptr [[COERCE]], align 16
+// X86-NEXT:    store <2 x double> [[TMP8]], ptr [[RETVAL]], align 16
+// X86-NEXT:    [[TMP9:%.*]] = load <2 x i64>, ptr [[RETVAL]], align 16
+// X86-NEXT:    ret <2 x i64> [[TMP9]]
+//
 __m128d test_mm_xor_pd(__m128d A, __m128d B) {
-  // CHECK-LABEL: test_mm_xor_pd
-  // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_xor_pd(A, B);
 }
 
+//
+// X86-LABEL: define <2 x i64> @test_mm_xor_si128(
+// X86-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] {
+// X86-NEXT:  entry:
+// X86-NEXT:    [[__A_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[__B_ADDR_I:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[A_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    [[B_ADDR:%.*]] = alloca <2 x i64>, align 16
+// X86-NEXT:    store <2 x i64> [[A]], ptr [[A_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[B]], ptr [[B_ADDR]], align 16
+// X86-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16
+// X86-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP0]], ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    store <2 x i64> [[TMP1]], ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[TMP2:%.*]] = load <2 x i64>, ptr [[__A_ADDR_I]], align 16
+// X86-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr [[__B_ADDR_I]], align 16
+// X86-NEXT:    [[XOR_I:%.*]] = xor <2 x i64> [[TMP2]], [[TMP3]]
+// X86-NEXT:    ret <2 x i64> [[XOR_I]]
+//
 __m128i test_mm_xor_si128(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_xor_si128
-  // CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_xor_si128(A, B);
 }
+//.
+// X86: [[META3]] = !{i32 1}
+//.
+//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+// CHECK: {{.*}}
+// X64: {{.*}}


