[llvm] r333795 - [X86] Expand the testing of expand and compress intrinsics

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 1 14:59:24 PDT 2018


Author: ctopper
Date: Fri Jun  1 14:59:24 2018
New Revision: 333795

URL: http://llvm.org/viewvc/llvm-project?rev=333795&view=rev
Log:
[X86] Expand the testing of expand and compress intrinsics

The avx512f intrinsic tests were in the avx512vl file. We were also missing some combinations of masking.

This does show that we fail to use the zero-masking form of expand loads when the passthru is zero. I'll try to get that fixed shortly.
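
For reference, a rough sketch of the missed optimization (illustrative only, not part of this change): the new test_maskz_expand_load_pd_512 test below currently zeroes the destination with a separate vpxor and then uses merge masking,

    kmovw %esi, %k1
    vpxor %xmm0, %xmm0, %xmm0
    vexpandpd (%rdi), %zmm0 {%k1}

whereas the zero-masking form should let the zeroing fold into the instruction itself, roughly:

    kmovw %esi, %k1
    vexpandpd (%rdi), %zmm0 {%k1} {z}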

Modified:
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=333795&r1=333794&r2=333795&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Fri Jun  1 14:59:24 2018
@@ -1,6 +1,475 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
 
+
+define void @test_mask_compress_store_pd_512(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+
+define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vcompresspd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vcompresspd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_compress_pd_512(<8 x double> %data) {
+; CHECK-LABEL: test_compress_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+
+define void @test_compress_store_pd_512(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: test_compress_store_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_ps_512(i8* %addr, <16 x float> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_store_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.ps.512(i8* %addr, <16 x float> %data, i16 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.ps.512(i8* %addr, <16 x float> %data, i16 %mask)
+
+define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vcompressps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_compress_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vcompressps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_compress_ps_512(<16 x float> %data) {
+; CHECK-LABEL: test_compress_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask)
+
+define void @test_compress_store_ps_512(i8* %addr, <16 x float> %data) {
+; CHECK-LABEL: test_compress_store_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.ps.512(i8* %addr, <16 x float> %data, i16 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_q_512(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+
+define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_compress_q_512(<8 x i64> %data) {
+; CHECK-LABEL: test_compress_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask)
+
+define void @test_compress_store_q_512(i8* %addr, <8 x i64> %data) {
+; CHECK-LABEL: test_compress_store_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_d_512(i8* %addr, <16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_store_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpcompressd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.d.512(i8* %addr, <16 x i32> %data, i16 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.d.512(i8* %addr, <16 x i32> %data, i16 %mask)
+
+define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcompressd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_compress_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcompressd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_compress_d_512(<16 x i32> %data) {
+; CHECK-LABEL: test_compress_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
+
+define void @test_compress_store_d_512(i8* %addr, <16 x i32> %data) {
+; CHECK-LABEL: test_compress_store_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.d.512(i8* %addr, <16 x i32> %data, i16 -1)
+  ret void
+}
+
+define <8 x double> @test_mask_expand_load_pd_512(i8* %addr, <8 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_maskz_expand_load_pd_512(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+
+define <8 x double> @test_expand_pd_512(<8 x double> %data) {
+; CHECK-LABEL: test_expand_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vexpandpd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
+  ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+
+define <8 x double> @test_expand_load_pd_512(i8* %addr, <8 x double> %data) {
+; CHECK-LABEL: test_expand_load_pd_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+  ret <8 x double> %res
+}
+
+define <16 x float> @test_mask_expand_load_ps_512(i8* %addr, <16 x float> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_load_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(i8* %addr, <16 x float> %data, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_maskz_expand_load_ps_512(i8* %addr, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(i8* %addr, <16 x float> zeroinitializer, i16 %mask)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(i8* %addr, <16 x float> %data, i16 %mask)
+
+define <16 x float> @test_expand_ps_512(<16 x float> %data) {
+; CHECK-LABEL: test_expand_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vexpandps %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask)
+
+define <16 x float> @test_expand_load_ps_512(i8* %addr, <16 x float> %data) {
+; CHECK-LABEL: test_expand_load_ps_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(i8* %addr, <16 x float> %data, i16 -1)
+  ret <16 x float> %res
+}
+
+define <8 x i64> @test_mask_expand_load_q_512(i8* %addr, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpexpandq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_maskz_expand_load_q_512(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandq (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+
+define <8 x i64> @test_expand_q_512(<8 x i64> %data) {
+; CHECK-LABEL: test_expand_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpexpandq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask)
+
+define <8 x i64> @test_expand_load_q_512(i8* %addr, <8 x i64> %data) {
+; CHECK-LABEL: test_expand_load_q_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 -1)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @test_mask_expand_load_d_512(i8* %addr, <16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_load_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpexpandd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(i8* %addr, <16 x i32> %data, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_maskz_expand_load_d_512(i8* %addr, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(i8* %addr, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(i8* %addr, <16 x i32> %data, i16 %mask)
+
+define <16 x i32> @test_expand_d_512(<16 x i32> %data) {
+; CHECK-LABEL: test_expand_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpexpandd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask)
+
+define <16 x i32> @test_expand_load_d_512(i8* %addr, <16 x i32> %data) {
+; CHECK-LABEL: test_expand_load_d_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(i8* %addr, <16 x i32> %data, i16 -1)
+  ret <16 x i32> %res
+}
+
 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_rcp_ps_512:
 ; CHECK:       ## %bb.0:

Modified: llvm/trunk/test/CodeGen/X86/avx512vbmi2-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vbmi2-intrinsics.ll?rev=333795&r1=333794&r2=333795&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vbmi2-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vbmi2-intrinsics.ll Fri Jun  1 14:59:24 2018
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vbmi2 | FileCheck %s
 
-define <32 x i16> @test_expand_load_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
-; CHECK-LABEL: test_expand_load_w_512:
+define <32 x i16> @test_mask_expand_load_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_mask_expand_load_w_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpexpandw (%rdi), %zmm0 {%k1}
@@ -10,21 +10,62 @@ define <32 x i16> @test_expand_load_w_51
   %res = call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
   ret <32 x i16> %res
 }
-declare <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
 
-define void @test_compress_store_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
-; CHECK-LABEL: test_compress_store_w_512:
+define <32 x i16> @test_maskz_expand_load_w_512(i8* %addr, i32 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_w_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpcompressw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandw (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
-  ret void
+  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
 }
-declare void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
 
-define <64 x i8> @test_expand_load_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
-; CHECK-LABEL: test_expand_load_b_512:
+declare <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+
+define <32 x i16> @test_expand_w_512(<32 x i16> %data) {
+; CHECK-LABEL: test_expand_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_expand_w_512(<32 x i16> %data, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_expand_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandw %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> %passthru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_maskz_expand_w_512(<32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_maskz_expand_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandw %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> %src0, i32 %mask)
+
+define <32 x i16> @test_expand_load_w_512(i8* %addr, <32 x i16> %data) {
+; CHECK-LABEL: test_expand_load_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %addr, <32 x i16> %data, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <64 x i8> @test_mask_expand_load_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_mask_expand_load_b_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovq %rsi, %k1
 ; CHECK-NEXT:    vpexpandb (%rdi), %zmm0 {%k1}
@@ -32,64 +73,163 @@ define <64 x i8> @test_expand_load_b_512
   %res = call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
   ret <64 x i8> %res
 }
-declare <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
 
-define void @test_compress_store_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
-; CHECK-LABEL: test_compress_store_b_512:
+define <64 x i8> @test_maskz_expand_load_b_512(i8* %addr, i64 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_b_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovq %rsi, %k1
-; CHECK-NEXT:    vpcompressb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandb (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+
+define <64 x i8> @test_expand_b_512(<64 x i8> %data) {
+; CHECK-LABEL: test_expand_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_expand_b_512(<64 x i8> %data, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: test_mask_expand_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovq %rdi, %k1
+; CHECK-NEXT:    vpexpandb %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> %passthru, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_maskz_expand_b_512(<64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_maskz_expand_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovq %rdi, %k1
+; CHECK-NEXT:    vpexpandb %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> %src0, i64 %mask)
+
+define <64 x i8> @test_expand_load_b_512(i8* %addr, <64 x i8> %data) {
+; CHECK-LABEL: test_expand_load_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0
+; CHECK-NEXT:    retq
+  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %addr, <64 x i8> %data, i64 -1)
+  ret <64 x i8> %res
+}
+
+define void @test_mask_compress_store_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_mask_compress_store_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpcompressw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
   ret void
 }
-declare void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
 
-define <32 x i16> @test_compress_w_512(<32 x i16> %data, <32 x i16> %src, i32 %mask) {
-; CHECK-LABEL: test_compress_w_512:
+declare void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 %mask)
+
+define <32 x i16> @test_mask_compress_w_512(<32 x i16> %data, <32 x i16> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_compress_w_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpcompressw %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> %src, i32 %mask)
+  %res = call <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> %passthru, i32 %mask)
   ret <32 x i16> %res
 }
-declare <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16>, <32 x i16>, i32)
 
-define <64 x i8> @test_compress_b_512(<64 x i8> %data, <64 x i8> %src, i64 %mask) {
-; CHECK-LABEL: test_compress_b_512:
+define <32 x i16> @test_maskz_compress_w_512(<32 x i16> %data, i32 %mask) {
+; CHECK-LABEL: test_maskz_compress_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressw %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_compress_w_512(<32 x i16> %data) {
+; CHECK-LABEL: test_compress_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16> %data, <32 x i16> %src0, i32 %mask)
+
+define void @test_compress_store_w_512(i8* %addr, <32 x i16> %data) {
+; CHECK-LABEL: test_compress_store_w_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %addr, <32 x i16> %data, i32 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_mask_compress_store_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovq %rsi, %k1
+; CHECK-NEXT:    vpcompressb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 %mask)
+
+define <64 x i8> @test_mask_compress_b_512(<64 x i8> %data, <64 x i8> %passthru, i64 %mask) {
+; CHECK-LABEL: test_mask_compress_b_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovq %rdi, %k1
 ; CHECK-NEXT:    vpcompressb %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> %src, i64 %mask)
+  %res = call <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> %passthru, i64 %mask)
   ret <64 x i8> %res
 }
-declare <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8>, <64 x i8>, i64)
 
-define <32 x i16> @test_expand_w_512(i8* %addr, <32 x i16> %data, i32 %mask) {
-; CHECK-LABEL: test_expand_w_512:
+define <64 x i8> @test_maskz_compress_b_512(<64 x i8> %data, i64 %mask) {
+; CHECK-LABEL: test_maskz_compress_b_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpexpandw %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovq %rdi, %k1
+; CHECK-NEXT:    vpcompressb %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16> %data, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %res = call <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
 }
-declare <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16>, <32 x i16>, i32)
 
-define <64 x i8> @test_expand_b_512(i8* %addr, <64 x i8> %data, i64 %mask) {
-; CHECK-LABEL: test_expand_b_512:
+define <64 x i8> @test_compress_b_512(<64 x i8> %data) {
+; CHECK-LABEL: test_compress_b_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovq %rsi, %k1
-; CHECK-NEXT:    vpexpandb %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8> %data, <64 x i8> zeroinitializer, i64 %mask)
+  %res = call <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> undef, i64 -1)
   ret <64 x i8> %res
 }
-declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8>, <64 x i8>, i64)
+
+declare <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8> %data, <64 x i8> %src0, i64 %mask)
+
+define void @test_compress_store_b_512(i8* %addr, <64 x i8> %data) {
+; CHECK-LABEL: test_compress_store_b_512:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %addr, <64 x i8> %data, i64 -1)
+  ret void
+}
 
 define <16 x i32>@test_int_x86_avx512_mask_vpshld_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_d_512:

Modified: llvm/trunk/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll?rev=333795&r1=333794&r2=333795&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vbmi2vl-intrinsics.ll Fri Jun  1 14:59:24 2018
@@ -1,19 +1,157 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl,+avx512vbmi2 | FileCheck %s
 
-define <16 x i16> @test_compress_w_256(<16 x i16> %src, <16 x i16> %data, i16 %mask) {
-; CHECK-LABEL: test_compress_w_256:
+define <8 x i16> @test_mask_expand_load_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpexpandw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_maskz_expand_load_w_128(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+
+define <8 x i16> @test_expand_w_128(<8 x i16> %data) {
+; CHECK-LABEL: test_expand_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> undef, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_expand_w_128(<8 x i16> %data, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_w_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcompressw %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpexpandw %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> %src, i16 %mask)
-  ret <16 x i16> %res
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> %passthru, i8 %mask)
+  ret <8 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
 
-define <8 x i16> @test_compress_w_128(<8 x i16> %data, i8 %mask) {
-; CHECK-LABEL: test_compress_w_128:
+define <8 x i16> @test_maskz_expand_w_128(<8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> %src0, i8 %mask)
+
+define <8 x i16> @test_expand_load_w_128(i8* %addr, <8 x i16> %data) {
+; CHECK-LABEL: test_expand_load_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <16 x i8> @test_mask_expand_load_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_load_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpexpandb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_maskz_expand_load_b_128(i8* %addr, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandb (%rdi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+
+define <16 x i8> @test_expand_b_128(<16 x i8> %data) {
+; CHECK-LABEL: test_expand_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> undef, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_expand_b_128(<16 x i8> %data, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandb %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> %passthru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_maskz_expand_b_128(<16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> %src0, i16 %mask)
+
+define <16 x i8> @test_expand_load_b_128(i8* %addr, <16 x i8> %data) {
+; CHECK-LABEL: test_expand_load_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 -1)
+  ret <16 x i8> %res
+}
+
+define void @test_mask_compress_store_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpcompressw %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
+
+define <8 x i16> @test_mask_compress_w_128(<8 x i16> %data, <8 x i16> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressw %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16> %passthru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_maskz_compress_w_128(<8 x i16> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_w_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpcompressw %xmm0, %xmm0 {%k1} {z}
@@ -21,21 +159,51 @@ define <8 x i16> @test_compress_w_128(<8
   %res = call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16> zeroinitializer, i8 %mask)
   ret <8 x i16> %res
 }
-declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
 
-define <32 x i8> @test_compress_b_256(<32 x i8> %src, <32 x i8> %data, i32 %mask) {
-; CHECK-LABEL: test_compress_b_256:
+define <8 x i16> @test_compress_w_128(<8 x i16> %data) {
+; CHECK-LABEL: test_compress_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16> undef, i8 -1)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16> %data, <8 x i16> %src0, i8 %mask)
+
+define void @test_compress_store_w_128(i8* %addr, <8 x i16> %data) {
+; CHECK-LABEL: test_compress_store_w_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_store_b_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpcompressb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+
+define <16 x i8> @test_mask_compress_b_128(<16 x i8> %data, <16 x i8> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_b_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcompressb %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpcompressb %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> %src, i32 %mask)
-  ret <32 x i8> %res
+  %res = call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %data, <16 x i8> %passthru, i16 %mask)
+  ret <16 x i8> %res
 }
-declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
 
-define <16 x i8> @test_compress_b_128(<16 x i8> %data, i16 %mask) {
-; CHECK-LABEL: test_compress_b_128:
+define <16 x i8> @test_maskz_compress_b_128(<16 x i8> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_compress_b_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpcompressb %xmm0, %xmm0 {%k1} {z}
@@ -43,78 +211,154 @@ define <16 x i8> @test_compress_b_128(<1
   %res = call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %data, <16 x i8> zeroinitializer, i16 %mask)
   ret <16 x i8> %res
 }
-declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
 
-define <32 x i8> @test_expand_b_256(<32 x i8> %data, <32 x i8> %src, i32 %mask) {
-; CHECK-LABEL: test_expand_b_256:
+define <16 x i8> @test_compress_b_128(<16 x i8> %data) {
+; CHECK-LABEL: test_compress_b_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpexpandb %ymm0, %ymm1 {%k1}
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.b.256( <32 x i8> %data, <32 x i8> %src, i32 %mask)
-  ret <32 x i8> %res
+  %res = call <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %data, <16 x i8> undef, i16 -1)
+  ret <16 x i8> %res
 }
-declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
 
-define <16 x i8> @test_expand_b_128(<16 x i8> %data, i16 %mask) {
-; CHECK-LABEL: test_expand_b_128:
+declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8> %data, <16 x i8> %src0, i16 %mask)
+
+define void @test_compress_store_b_128(i8* %addr, <16 x i8> %data) {
+; CHECK-LABEL: test_compress_store_b_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpexpandb %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovups %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
-  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8> %data, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
+  call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 -1)
+  ret void
+}
+
+define <16 x i16> @test_mask_expand_load_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_load_w_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpexpandw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_maskz_expand_load_w_256(i8* %addr, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_w_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
 }
-declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
 
-define <16 x i16> @test_expand_w_256(<16 x i16> %data, <16 x i16> %src, i16 %mask) {
+declare <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+
+define <16 x i16> @test_expand_w_256(<16 x i16> %data) {
 ; CHECK-LABEL: test_expand_w_256:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %data, <16 x i16> undef, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_expand_w_256(<16 x i16> %data, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_expand_w_256:
+; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpexpandw %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.w.256( <16 x i16> %data, <16 x i16> %src, i16 %mask)
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %data, <16 x i16> %passthru, i16 %mask)
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
 
-define <8 x i16> @test_expand_w_128(<8 x i16> %data, i8 %mask) {
-; CHECK-LABEL: test_expand_w_128:
+define <16 x i16> @test_maskz_expand_w_256(<16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_expand_w_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpexpandw %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpexpandw %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16> %data, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %data, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
 }
-declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
 
-define <16 x i16> @test_expand_load_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
+declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16> %data, <16 x i16> %src0, i16 %mask)
+
+define <16 x i16> @test_expand_load_w_256(i8* %addr, <16 x i16> %data) {
 ; CHECK-LABEL: test_expand_load_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpexpandw (%rdi), %ymm0 {%k1}
+; CHECK-NEXT:    vmovups (%rdi), %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
+  %res = call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 -1)
   ret <16 x i16> %res
 }
-declare <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
 
-define <8 x i16> @test_expand_load_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
-; CHECK-LABEL: test_expand_load_w_128:
+define <32 x i8> @test_mask_expand_load_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_mask_expand_load_b_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpexpandw (%rdi), %xmm0 {%k1}
+; CHECK-NEXT:    vpexpandb (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
-  %res = call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
-  ret <8 x i16> %res
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+  ret <32 x i8> %res
 }
-declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
 
-define void @test_compress_store_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
-; CHECK-LABEL: test_compress_store_w_256:
+define <32 x i8> @test_maskz_expand_load_b_256(i8* %addr, i32 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpexpandb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
+
+define <32 x i8> @test_expand_b_256(<32 x i8> %data) {
+; CHECK-LABEL: test_expand_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8> undef, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_expand_b_256(<32 x i8> %data, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_expand_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandb %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8> %passthru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_maskz_expand_b_256(<32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_maskz_expand_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpexpandb %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8> %data, <32 x i8> %src0, i32 %mask)
+
+define <32 x i8> @test_expand_load_b_256(i8* %addr, <32 x i8> %data) {
+; CHECK-LABEL: test_expand_load_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %ymm0
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 -1)
+  ret <32 x i8> %res
+}
+
+define void @test_mask_compress_store_w_256(i8* %addr, <16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_store_w_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpcompressw %ymm0, (%rdi) {%k1}
@@ -122,43 +366,51 @@ define void @test_compress_store_w_256(i
   call void @llvm.x86.avx512.mask.compress.store.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
   ret void
 }
+
 declare void @llvm.x86.avx512.mask.compress.store.w.256(i8* %addr, <16 x i16> %data, i16 %mask)
 
-define void @test_compress_store_w_128(i8* %addr, <8 x i16> %data, i8 %mask) {
-; CHECK-LABEL: test_compress_store_w_128:
+define <16 x i16> @test_mask_compress_w_256(<16 x i16> %data, <16 x i16> %passthru, i16 %mask) {
+; CHECK-LABEL: test_mask_compress_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpcompressw %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressw %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
-  ret void
+  %res = call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> %passthru, i16 %mask)
+  ret <16 x i16> %res
 }
-declare void @llvm.x86.avx512.mask.compress.store.w.128(i8* %addr, <8 x i16> %data, i8 %mask)
 
-define <32 x i8> @test_expand_load_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
-; CHECK-LABEL: test_expand_load_b_256:
+define <16 x i16> @test_maskz_compress_w_256(<16 x i16> %data, i16 %mask) {
+; CHECK-LABEL: test_maskz_compress_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpexpandb (%rdi), %ymm0 {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressw %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
-  ret <32 x i8> %res
+  %res = call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
 }
-declare <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
 
-define <16 x i8> @test_expand_load_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
-; CHECK-LABEL: test_expand_load_b_128:
+define <16 x i16> @test_compress_w_256(<16 x i16> %data) {
+; CHECK-LABEL: test_compress_w_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpexpandb (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
-  ret <16 x i8> %res
+  %res = call <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> undef, i16 -1)
+  ret <16 x i16> %res
 }
-declare <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
 
-define void @test_compress_store_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
-; CHECK-LABEL: test_compress_store_b_256:
+declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16> %data, <16 x i16> %src0, i16 %mask)
+
+define void @test_compress_store_w_256(i8* %addr, <16 x i16> %data) {
+; CHECK-LABEL: test_compress_store_w_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.w.256(i8* %addr, <16 x i16> %data, i16 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_b_256(i8* %addr, <32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_mask_compress_store_b_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpcompressb %ymm0, (%rdi) {%k1}
@@ -166,18 +418,48 @@ define void @test_compress_store_b_256(i
   call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
   ret void
 }
+
 declare void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 %mask)
 
-define void @test_compress_store_b_128(i8* %addr, <16 x i8> %data, i16 %mask) {
-; CHECK-LABEL: test_compress_store_b_128:
+define <32 x i8> @test_mask_compress_b_256(<32 x i8> %data, <32 x i8> %passthru, i32 %mask) {
+; CHECK-LABEL: test_mask_compress_b_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %esi, %k1
-; CHECK-NEXT:    vpcompressb %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressb %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
+  %res = call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> %passthru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_maskz_compress_b_256(<32 x i8> %data, i32 %mask) {
+; CHECK-LABEL: test_maskz_compress_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpcompressb %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_compress_b_256(<32 x i8> %data) {
+; CHECK-LABEL: test_compress_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq
+  %res = call <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> undef, i32 -1)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8> %data, <32 x i8> %src0, i32 %mask)
+
+define void @test_compress_store_b_256(i8* %addr, <32 x i8> %data) {
+; CHECK-LABEL: test_compress_store_b_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %addr, <32 x i8> %data, i32 -1)
   ret void
 }
-declare void @llvm.x86.avx512.mask.compress.store.b.128(i8* %addr, <16 x i8> %data, i16 %mask)
 
 define <4 x i32>@test_int_x86_avx512_mask_vpshld_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpshld_d_128:

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=333795&r1=333794&r2=333795&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Fri Jun  1 14:59:24 2018
@@ -1,32 +1,61 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl --show-mc-encoding| FileCheck %s
 
-define void @compr1(i8* %addr, <8 x double> %data, i8 %mask) {
-; CHECK-LABEL: compr1:
+
+define void @test_mask_compress_store_pd_128(i8* %addr, <2 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_pd_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vcompresspd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8a,0x07]
+; CHECK-NEXT:    vcompresspd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x8a,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+  call void @llvm.x86.avx512.mask.compress.store.pd.128(i8* %addr, <2 x double> %data, i8 %mask)
   ret void
 }
 
-declare void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
+declare void @llvm.x86.avx512.mask.compress.store.pd.128(i8* %addr, <2 x double> %data, i8 %mask)
 
-define void @compr2(i8* %addr, <4 x double> %data, i8 %mask) {
-; CHECK-LABEL: compr2:
+define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_pd_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompresspd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x8a,0xc1]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-  ret void
+  %res = call <2 x double> @llvm.x86.avx512.mask.compress.pd.128(<2 x double> %data, <2 x double> %passthru, i8 %mask)
+  ret <2 x double> %res
 }
 
-declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+define <2 x double> @test_maskz_compress_pd_128(<2 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompresspd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x8a,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.compress.pd.128(<2 x double> %data, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
 
-define void @compr3(i8* %addr, <4 x float> %data, i8 %mask) {
-; CHECK-LABEL: compr3:
+define <2 x double> @test_compress_pd_128(<2 x double> %data) {
+; CHECK-LABEL: test_compress_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.compress.pd.128(<2 x double> %data, <2 x double> undef, i8 -1)
+  ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.compress.pd.128(<2 x double> %data, <2 x double> %src0, i8 %mask)
+
+define void @test_compress_store_pd_128(i8* %addr, <2 x double> %data) {
+; CHECK-LABEL: test_compress_store_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.pd.128(i8* %addr, <2 x double> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_ps_128(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_ps_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vcompressps %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0x07]
@@ -37,143 +66,215 @@ define void @compr3(i8* %addr, <4 x floa
 
 declare void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
 
-define <8 x double> @compr4(i8* %addr, <8 x double> %data, i8 %mask) {
-; CHECK-LABEL: compr4:
+define <4 x float> @test_mask_compress_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_ps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vcompresspd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x8a,0xc0]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompressps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8a,0xc1]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
-  ret <8 x double> %res
+  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %passthru, i8 %mask)
+  ret <4 x float> %res
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
-
-define <4 x double> @compr5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
-; CHECK-LABEL: compr5:
+define <4 x float> @test_maskz_compress_ps_128(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_ps_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
-  ret <4 x double> %res
+  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
 }
 
-declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
-
-define <4 x float> @compr6(<4 x float> %data, i8 %mask) {
-; CHECK-LABEL: compr6:
+define <4 x float> @test_compress_ps_128(<4 x float> %data) {
+; CHECK-LABEL: test_compress_ps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vcompressps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8a,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
+  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> undef, i8 -1)
   ret <4 x float> %res
 }
 
 declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
 
-define void @compr7(i8* %addr, <8 x double> %data) {
-; CHECK-LABEL: compr7:
+define void @test_compress_store_ps_128(i8* %addr, <4 x float> %data) {
+; CHECK-LABEL: test_compress_store_ps_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07]
+; CHECK-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1)
+  call void @llvm.x86.avx512.mask.compress.store.ps.128(i8* %addr, <4 x float> %data, i8 -1)
   ret void
 }
 
-define <4 x float> @compr8(<4 x float> %data) {
-; CHECK-LABEL: compr8:
+define void @test_mask_compress_store_q_128(i8* %addr, <2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_q_128:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpcompressq %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x8b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
-  ret <4 x float> %res
+  call void @llvm.x86.avx512.mask.compress.store.q.128(i8* %addr, <2 x i64> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.q.128(i8* %addr, <2 x i64> %data, i8 %mask)
+
+define <2 x i64> @test_mask_compress_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x8b,0xc1]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.compress.q.128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask)
+  ret <2 x i64> %res
 }
 
-define void @compr9(i8* %addr, <8 x i64> %data, i8 %mask) {
-; CHECK-LABEL: compr9:
+define <2 x i64> @test_maskz_compress_q_128(<2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x8b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.compress.q.128(<2 x i64> %data, <2 x i64> zeroinitializer, i8 %mask)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_compress_q_128(<2 x i64> %data) {
+; CHECK-LABEL: test_compress_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.compress.q.128(<2 x i64> %data, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.compress.q.128(<2 x i64> %data, <2 x i64> %src0, i8 %mask)
+
+define void @test_compress_store_q_128(i8* %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_compress_store_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.q.128(i8* %addr, <2 x i64> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_d_128(i8* %addr, <4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_d_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpcompressq %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x8b,0x07]
+; CHECK-NEXT:    vpcompressd %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  call void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+  call void @llvm.x86.avx512.mask.compress.store.d.128(i8* %addr, <4 x i32> %data, i8 %mask)
   ret void
 }
 
-declare void @llvm.x86.avx512.mask.compress.store.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+declare void @llvm.x86.avx512.mask.compress.store.d.128(i8* %addr, <4 x i32> %data, i8 %mask)
 
-define <4 x i32> @compr10(<4 x i32> %data, i8 %mask) {
-; CHECK-LABEL: compr10:
+define <4 x i32> @test_mask_compress_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_d_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
+; CHECK-NEXT:    vpcompressd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x8b,0xc1]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
+  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask)
   ret <4 x i32> %res
 }
 
-
-@xmm = common global <4 x i32> zeroinitializer, align 16
-@k8 = common global i8 0, align 1
-
-define i32 @compr11() {
-; CHECK-LABEL: compr11:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    movq _xmm@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 3, value: _xmm@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; CHECK-NEXT:    vmovdqa (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x00]
-; CHECK-NEXT:    movq _k8@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
-; CHECK-NEXT:    ## fixup A - offset: 3, value: _k8@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; CHECK-NEXT:    movzbl (%rax), %eax ## encoding: [0x0f,0xb6,0x00]
-; CHECK-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+define <4 x i32> @test_maskz_compress_d_128(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpcompressd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x8b,0xc0]
-; CHECK-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x44,0x24,0xd8]
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
-; CHECK-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp) ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7f,0x44,0x24,0xe8]
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-entry:
-  %.compoundliteral = alloca <2 x i64>, align 16
-  %res = alloca <4 x i32>, align 16
-  %a0 = load <4 x i32>, <4 x i32>* @xmm, align 16
-  %a2 = load i8, i8* @k8, align 1
-  %a21 = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %a0, <4 x i32> zeroinitializer, i8 %a2) #2
-  store volatile <4 x i32> %a21, <4 x i32>* %res, align 16
-  store <2 x i64> zeroinitializer, <2 x i64>* %.compoundliteral, align 16
-  ret i32 0
+  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_compress_d_128(<4 x i32> %data) {
+; CHECK-LABEL: test_compress_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> undef, i8 -1)
+  ret <4 x i32> %res
 }
 
 declare <4 x i32> @llvm.x86.avx512.mask.compress.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
 
-; Expand
+define void @test_compress_store_d_128(i8* %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_compress_store_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %xmm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.d.128(i8* %addr, <4 x i32> %data, i8 -1)
+  ret void
+}
 
-define <8 x double> @expand1(i8* %addr, <8 x double> %data, i8 %mask) {
-; CHECK-LABEL: expand1:
+define <2 x double> @test_mask_expand_load_pd_128(i8* %addr, <2 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_pd_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vexpandpd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x88,0x07]
+; CHECK-NEXT:    vexpandpd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x88,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-  ret <8 x double> %res
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.load.pd.128(i8* %addr, <2 x double> %data, i8 %mask)
+  ret <2 x double> %res
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 %mask)
-
-define <4 x double> @expand2(i8* %addr, <4 x double> %data, i8 %mask) {
-; CHECK-LABEL: expand2:
+define <2 x double> @test_maskz_expand_load_pd_128(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_pd_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vexpandpd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x88,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
-  ret <4 x double> %res
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.load.pd.128(i8* %addr, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
 }
 
-declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+declare <2 x double> @llvm.x86.avx512.mask.expand.load.pd.128(i8* %addr, <2 x double> %data, i8 %mask)
+
+define <2 x double> @test_expand_pd_128(<2 x double> %data) {
+; CHECK-LABEL: test_expand_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.pd.128(<2 x double> %data, <2 x double> undef, i8 -1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_expand_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandpd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x88,0xc8]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.pd.128(<2 x double> %data, <2 x double> %passthru, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_expand_pd_128(<2 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandpd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x88,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.pd.128(<2 x double> %data, <2 x double> zeroinitializer, i8 %mask)
+  ret <2 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.expand.pd.128(<2 x double> %data, <2 x double> %src0, i8 %mask)
+
+define <2 x double> @test_expand_load_pd_128(i8* %addr, <2 x double> %data) {
+; CHECK-LABEL: test_expand_load_pd_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.expand.load.pd.128(i8* %addr, <2 x double> %data, i8 -1)
+  ret <2 x double> %res
+}
 
-define <4 x float> @expand3(i8* %addr, <4 x float> %data, i8 %mask) {
-; CHECK-LABEL: expand3:
+define <4 x float> @test_mask_expand_load_ps_128(i8* %addr, <4 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_ps_128:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
@@ -182,108 +283,644 @@ define <4 x float> @expand3(i8* %addr, <
   ret <4 x float> %res
 }
 
+define <4 x float> @test_maskz_expand_load_ps_128(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_ps_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vexpandps (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
 declare <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 %mask)
 
-define <8 x double> @expand4(i8* %addr, <8 x double> %data, i8 %mask) {
-; CHECK-LABEL: expand4:
+define <4 x float> @test_expand_ps_128(<4 x float> %data) {
+; CHECK-LABEL: test_expand_ps_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> undef, i8 -1)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_expand_ps_128(<4 x float> %data, <4 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_ps_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x88,0xc8]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %passthru, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_expand_ps_128(<4 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_ps_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
+
+define <4 x float> @test_expand_load_ps_128(i8* %addr, <4 x float> %data) {
+; CHECK-LABEL: test_expand_load_ps_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.expand.load.ps.128(i8* %addr, <4 x float> %data, i8 -1)
+  ret <4 x float> %res
+}
+
+define <2 x i64> @test_mask_expand_load_q_128(i8* %addr, <2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpexpandq (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x89,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.load.q.128(i8* %addr, <2 x i64> %data, i8 %mask)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_maskz_expand_load_q_128(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpexpandq (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x89,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.load.q.128(i8* %addr, <2 x i64> zeroinitializer, i8 %mask)
+  ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.expand.load.q.128(i8* %addr, <2 x i64> %data, i8 %mask)
+
+define <2 x i64> @test_expand_q_128(<2 x i64> %data) {
+; CHECK-LABEL: test_expand_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.q.128(<2 x i64> %data, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_mask_expand_q_128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x89,0xc8]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.q.128(<2 x i64> %data, <2 x i64> %passthru, i8 %mask)
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @test_maskz_expand_q_128(<2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x89,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.q.128(<2 x i64> %data, <2 x i64> zeroinitializer, i8 %mask)
+  ret <2 x i64> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.expand.q.128(<2 x i64> %data, <2 x i64> %src0, i8 %mask)
+
+define <2 x i64> @test_expand_load_q_128(i8* %addr, <2 x i64> %data) {
+; CHECK-LABEL: test_expand_load_q_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.mask.expand.load.q.128(i8* %addr, <2 x i64> %data, i8 -1)
+  ret <2 x i64> %res
+}
+
+define <4 x i32> @test_mask_expand_load_d_128(i8* %addr, <4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpexpandd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x89,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.load.d.128(i8* %addr, <4 x i32> %data, i8 %mask)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_maskz_expand_load_d_128(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpexpandd (%rdi), %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x89,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.load.d.128(i8* %addr, <4 x i32> zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.expand.load.d.128(i8* %addr, <4 x i32> %data, i8 %mask)
+
+define <4 x i32> @test_expand_d_128(<4 x i32> %data) {
+; CHECK-LABEL: test_expand_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> undef, i8 -1)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_mask_expand_d_128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x89,0xc8]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %passthru, i8 %mask)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_maskz_expand_d_128(<4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> zeroinitializer, i8 %mask)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+
+define <4 x i32> @test_expand_load_d_128(i8* %addr, <4 x i32> %data) {
+; CHECK-LABEL: test_expand_load_d_128:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.load.d.128(i8* %addr, <4 x i32> %data, i8 -1)
+  ret <4 x i32> %res
+}
+
+define void @test_mask_compress_store_pd_256(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_pd_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x88,0xc0]
+; CHECK-NEXT:    vcompresspd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask)
-  ret <8 x double> %res
+  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+  ret void
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask)
+declare void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
 
-define <4 x double> @expand5(<4 x double> %data, <4 x double> %src0, i8 %mask) {
-; CHECK-LABEL: expand5:
+define <4 x double> @test_mask_compress_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompresspd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8a,0xc1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %passthru, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_maskz_compress_pd_256(<4 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompresspd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x8a,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_compress_pd_256(<4 x double> %data) {
+; CHECK-LABEL: test_compress_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> undef, i8 -1)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.compress.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
+
+define void @test_compress_store_pd_256(i8* %addr, <4 x double> %data) {
+; CHECK-LABEL: test_compress_store_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.pd.256(i8* %addr, <4 x double> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_ps_256(i8* %addr, <8 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vcompressps %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x8a,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.ps.256(i8* %addr, <8 x float> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.ps.256(i8* %addr, <8 x float> %data, i8 %mask)
+
+define <8 x float> @test_mask_compress_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompressps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x8a,0xc1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.compress.ps.256(<8 x float> %data, <8 x float> %passthru, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_maskz_compress_ps_256(<8 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vcompressps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x8a,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.compress.ps.256(<8 x float> %data, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_compress_ps_256(<8 x float> %data) {
+; CHECK-LABEL: test_compress_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.compress.ps.256(<8 x float> %data, <8 x float> undef, i8 -1)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.compress.ps.256(<8 x float> %data, <8 x float> %src0, i8 %mask)
+
+define void @test_compress_store_ps_256(i8* %addr, <8 x float> %data) {
+; CHECK-LABEL: test_compress_store_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.ps.256(i8* %addr, <8 x float> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_q_256(i8* %addr, <4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpcompressq %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.q.256(i8* %addr, <4 x i64> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.q.256(i8* %addr, <4 x i64> %data, i8 %mask)
+
+define <4 x i64> @test_mask_compress_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x8b,0xc1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.compress.q.256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_maskz_compress_q_256(<4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressq %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x8b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.compress.q.256(<4 x i64> %data, <4 x i64> zeroinitializer, i8 %mask)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_compress_q_256(<4 x i64> %data) {
+; CHECK-LABEL: test_compress_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.compress.q.256(<4 x i64> %data, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.compress.q.256(<4 x i64> %data, <4 x i64> %src0, i8 %mask)
+
+define void @test_compress_store_q_256(i8* %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_compress_store_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.q.256(i8* %addr, <4 x i64> %data, i8 -1)
+  ret void
+}
+
+define void @test_mask_compress_store_d_256(i8* %addr, <8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_store_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpcompressd %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x8b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.d.256(i8* %addr, <8 x i32> %data, i8 %mask)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.compress.store.d.256(i8* %addr, <8 x i32> %data, i8 %mask)
+
+define <8 x i32> @test_mask_compress_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_compress_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x8b,0xc1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.compress.d.256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask)
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_maskz_compress_d_256(<8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_compress_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcompressd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x8b,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.compress.d.256(<8 x i32> %data, <8 x i32> zeroinitializer, i8 %mask)
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_compress_d_256(<8 x i32> %data) {
+; CHECK-LABEL: test_compress_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.compress.d.256(<8 x i32> %data, <8 x i32> undef, i8 -1)
+  ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.compress.d.256(<8 x i32> %data, <8 x i32> %src0, i8 %mask)
+
+define void @test_compress_store_d_256(i8* %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_compress_store_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups %ymm0, (%rdi) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  call void @llvm.x86.avx512.mask.compress.store.d.256(i8* %addr, <8 x i32> %data, i8 -1)
+  ret void
+}
+
+define <4 x double> @test_mask_expand_load_pd_256(i8* %addr, <4 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_maskz_expand_load_pd_256(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vexpandpd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 %mask)
+
+define <4 x double> @test_expand_pd_256(<4 x double> %data) {
+; CHECK-LABEL: test_expand_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> undef, i8 -1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_expand_pd_256(<4 x double> %data, <4 x double> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_pd_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vexpandpd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x88,0xc8]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256( <4 x double> %data, <4 x double> %src0, i8 %mask)
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %passthru, i8 %mask)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_maskz_expand_pd_256(<4 x double> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x88,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> zeroinitializer, i8 %mask)
   ret <4 x double> %res
 }
 
 declare <4 x double> @llvm.x86.avx512.mask.expand.pd.256(<4 x double> %data, <4 x double> %src0, i8 %mask)
 
-define <4 x float> @expand6(<4 x float> %data, i8 %mask) {
-; CHECK-LABEL: expand6:
+define <4 x double> @test_expand_load_pd_256(i8* %addr, <4 x double> %data) {
+; CHECK-LABEL: test_expand_load_pd_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.expand.load.pd.256(i8* %addr, <4 x double> %data, i8 -1)
+  ret <4 x double> %res
+}
+
+define <8 x float> @test_mask_expand_load_ps_256(i8* %addr, <8 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vexpandps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x88,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.load.ps.256(i8* %addr, <8 x float> %data, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_maskz_expand_load_ps_256(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vexpandps (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x88,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.load.ps.256(i8* %addr, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.expand.load.ps.256(i8* %addr, <8 x float> %data, i8 %mask)
+
+define <8 x float> @test_expand_ps_256(<8 x float> %data) {
+; CHECK-LABEL: test_expand_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.ps.256(<8 x float> %data, <8 x float> undef, i8 -1)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mask_expand_ps_256(<8 x float> %data, <8 x float> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_ps_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vexpandps %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x88,0xc0]
+; CHECK-NEXT:    vexpandps %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x88,0xc8]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 %mask)
-  ret <4 x float> %res
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.ps.256(<8 x float> %data, <8 x float> %passthru, i8 %mask)
+  ret <8 x float> %res
 }
 
-declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float> %src0, i8 %mask)
+define <8 x float> @test_maskz_expand_ps_256(<8 x float> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_ps_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x88,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.ps.256(<8 x float> %data, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.expand.ps.256(<8 x float> %data, <8 x float> %src0, i8 %mask)
 
-define <8 x double> @expand7(i8* %addr, <8 x double> %data) {
-; CHECK-LABEL: expand7:
+define <8 x float> @test_expand_load_ps_256(i8* %addr, <8 x float> %data) {
+; CHECK-LABEL: test_expand_load_ps_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
+; CHECK-NEXT:    vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1)
-  ret <8 x double> %res
+  %res = call <8 x float> @llvm.x86.avx512.mask.expand.load.ps.256(i8* %addr, <8 x float> %data, i8 -1)
+  ret <8 x float> %res
 }
 
-define <4 x float> @expand8(<4 x float> %data) {
-; CHECK-LABEL: expand8:
+define <4 x i64> @test_mask_expand_load_q_256(i8* %addr, <4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_q_256:
 ; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpexpandq (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x89,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x float>zeroinitializer, i8 -1)
-  ret <4 x float> %res
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.load.q.256(i8* %addr, <4 x i64> %data, i8 %mask)
+  ret <4 x i64> %res
 }
 
-define <8 x i64> @expand9(i8* %addr, <8 x i64> %data, i8 %mask) {
-; CHECK-LABEL: expand9:
+define <4 x i64> @test_maskz_expand_load_q_256(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_q_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpexpandq (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x89,0x07]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpexpandq (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x89,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
-  ret <8 x i64> %res
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.load.q.256(i8* %addr, <4 x i64> zeroinitializer, i8 %mask)
+  ret <4 x i64> %res
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> %data, i8 %mask)
+declare <4 x i64> @llvm.x86.avx512.mask.expand.load.q.256(i8* %addr, <4 x i64> %data, i8 %mask)
 
-define <4 x i32> @expand10(<4 x i32> %data, i8 %mask) {
-; CHECK-LABEL: expand10:
+define <4 x i64> @test_expand_q_256(<4 x i64> %data) {
+; CHECK-LABEL: test_expand_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.q.256(<4 x i64> %data, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test_mask_expand_q_256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_q_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpexpandd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x89,0xc0]
+; CHECK-NEXT:    vpexpandq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x89,0xc8]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32>zeroinitializer, i8 %mask)
-  ret <4 x i32> %res
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.q.256(<4 x i64> %data, <4 x i64> %passthru, i8 %mask)
+  ret <4 x i64> %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.mask.expand.d.128(<4 x i32> %data, <4 x i32> %src0, i8 %mask)
+define <4 x i64> @test_maskz_expand_q_256(<4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_q_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x89,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.q.256(<4 x i64> %data, <4 x i64> zeroinitializer, i8 %mask)
+  ret <4 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.expand.q.256(<4 x i64> %data, <4 x i64> %src0, i8 %mask)
 
-define <8 x i64> @expand11(i8* %addr) {
-; CHECK-LABEL: expand11:
+define <4 x i64> @test_expand_load_q_256(i8* %addr, <4 x i64> %data) {
+; CHECK-LABEL: test_expand_load_q_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
+; CHECK-NEXT:    vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(i8* %addr, <8 x i64> undef, i8 -1)
-  ret <8 x i64> %res
+  %res = call <4 x i64> @llvm.x86.avx512.mask.expand.load.q.256(i8* %addr, <4 x i64> %data, i8 -1)
+  ret <4 x i64> %res
 }
 
-define <8 x i64> @expand12(i8* %addr, i8 %mask) {
-; CHECK-LABEL: expand12:
+define <8 x i32> @test_mask_expand_load_d_256(i8* %addr, <8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_load_d_256:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpexpandq (%rdi), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x89,0x07]
+; CHECK-NEXT:    vpexpandd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x89,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %laddr = bitcast i8* %addr to <8 x i64>*
-  %data = load <8 x i64>, <8 x i64>* %laddr, align 1
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64>zeroinitializer, i8 %mask)
-  ret <8 x i64> %res
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> %data, i8 %mask)
+  ret <8 x i32> %res
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> , <8 x i64>, i8)
+define <8 x i32> @test_maskz_expand_load_d_256(i8* %addr, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_load_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpexpandd (%rdi), %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x89,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> zeroinitializer, i8 %mask)
+  ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> %data, i8 %mask)
+
+define <8 x i32> @test_expand_d_256(<8 x i32> %data) {
+; CHECK-LABEL: test_expand_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> undef, i8 -1)
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_mask_expand_d_256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask) {
+; CHECK-LABEL: test_mask_expand_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x89,0xc8]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> %passthru, i8 %mask)
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_maskz_expand_d_256(<8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_maskz_expand_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x89,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> zeroinitializer, i8 %mask)
+  ret <8 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.expand.d.256(<8 x i32> %data, <8 x i32> %src0, i8 %mask)
+
+define <8 x i32> @test_expand_load_d_256(i8* %addr, <8 x i32> %data) {
+; CHECK-LABEL: test_expand_load_d_256:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vmovups (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x10,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.mask.expand.load.d.256(i8* %addr, <8 x i32> %data, i8 -1)
+  ret <8 x i32> %res
+}
 
 define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
 ; CHECK-LABEL: test_cmpps_256:



