[llvm] r359395 - [X86][AVX] Add AVX512DQ coverage for masked memory ops tests (PR34584)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 28 03:02:34 PDT 2019


Author: rksimon
Date: Sun Apr 28 03:02:34 2019
New Revision: 359395

URL: http://llvm.org/viewvc/llvm-project?rev=359395&view=rev
Log:
[X86][AVX] Add AVX512DQ coverage for masked memory ops tests (PR34584)

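For reference, the new RUN line pairs -mattr=avx512f,avx512dq,avx512vl with the AVX,AVX512,AVX512VL,AVX512VLDQ prefix chain, so DQ-only codegen (e.g. the vpmovd2m mask moves) gets dedicated AVX512VLDQ checks while output shared with the BW run is checked under the common AVX512VL prefix. A rough sketch of the kind of IR each test drives, with the signature paraphrased from the existing v2f32 compressstore case (illustrative only, not part of this commit):

  ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
  ; Illustrative sketch; signature paraphrased from the existing v2f32 test below.
  define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %trigger) {
    %mask = icmp eq <2 x i32> %trigger, zeroinitializer
    call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
    ret void
  }
  declare void @llvm.masked.compressstore.v2f32(<2 x float>, float*, <2 x i1>)
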
Modified:
    llvm/trunk/test/CodeGen/X86/masked_compressstore.ll
    llvm/trunk/test/CodeGen/X86/masked_expandload.ll
    llvm/trunk/test/CodeGen/X86/masked_load.ll
    llvm/trunk/test/CodeGen/X86/masked_store.ll

Modified: llvm/trunk/test/CodeGen/X86/masked_compressstore.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_compressstore.ll?rev=359395&r1=359394&r2=359395&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_compressstore.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_compressstore.ll Sun Apr 28 03:02:34 2019
@@ -4,7 +4,8 @@
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
 
 ;
 ; vXf64
@@ -266,6 +267,15 @@ define void @compressstore_v8f64_v8i1(do
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v8f64_v8i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k1
+; AVX512VLDQ-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v8f64_v8i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm1, %xmm1
@@ -789,6 +799,33 @@ define void @compressstore_v16f64_v16i1(
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v16f64_v16i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VLDQ-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512VLDQ-NEXT:    kmovb %k1, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    shrl %ecx
+; AVX512VLDQ-NEXT:    andl $-43, %ecx
+; AVX512VLDQ-NEXT:    subl %ecx, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
+; AVX512VLDQ-NEXT:    shrl $2, %eax
+; AVX512VLDQ-NEXT:    andl $858993459, %eax ## imm = 0x33333333
+; AVX512VLDQ-NEXT:    addl %ecx, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    shrl $4, %ecx
+; AVX512VLDQ-NEXT:    addl %eax, %ecx
+; AVX512VLDQ-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
+; AVX512VLDQ-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512VLDQ-NEXT:    shrl $24, %eax
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512VLDQ-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
+; AVX512VLDQ-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v16f64_v16i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $7, %xmm2, %xmm2
@@ -919,13 +956,13 @@ define void @compressstore_v2f32_v2i32(f
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: compressstore_v2f32_v2i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; AVX512VLBW-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: compressstore_v2f32_v2i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
   ret void
@@ -1041,6 +1078,13 @@ define void @compressstore_v4f32_v4i1(fl
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v4f32_v4i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm1, %k1
+; AVX512VLDQ-NEXT:    vcompressps %xmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v4f32_v4i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -1254,6 +1298,15 @@ define void @compressstore_v8f32_v8i1(fl
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v8f32_v8i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k1
+; AVX512VLDQ-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v8f32_v8i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm1, %xmm1
@@ -1347,6 +1400,14 @@ define void @compressstore_v16f32_const(
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v16f32_const:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v16f32_const:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
@@ -2730,6 +2791,13 @@ define void @compressstore_v2i64_v2i1(i6
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v2i64_v2i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpsllq $63, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovq2m %xmm1, %k1
+; AVX512VLDQ-NEXT:    vpcompressq %xmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v2i64_v2i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllq $63, %xmm1, %xmm1
@@ -2884,6 +2952,14 @@ define void @compressstore_v4i64_v4i1(i6
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v4i64_v4i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm1, %k1
+; AVX512VLDQ-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v4i64_v4i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -3155,6 +3231,15 @@ define void @compressstore_v8i64_v8i1(i6
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v8i64_v8i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k1
+; AVX512VLDQ-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v8i64_v8i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm1, %xmm1
@@ -3290,11 +3375,11 @@ define void @compressstore_v4i32_v4i32(i
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: compressstore_v4i32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; AVX512VLBW-NEXT:    vpcompressd %xmm0, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: compressstore_v4i32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vpcompressd %xmm0, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   call void @llvm.masked.compressstore.v4i32(<4 x i32> %V, i32* %base, <4 x i1> %mask)
   ret void
@@ -3597,6 +3682,89 @@ define void @compressstore_v8i16_v8i16(i
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v8i16_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrw $1, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store4
+; AVX512VLDQ-NEXT:    vpextrw $2, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_6: ## %else5
+; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrw $3, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store10
+; AVX512VLDQ-NEXT:    vpextrw $4, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_10: ## %else11
+; AVX512VLDQ-NEXT:    kshiftrb $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrw $5, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store16
+; AVX512VLDQ-NEXT:    vpextrw $6, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_14: ## %else17
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store19
+; AVX512VLDQ-NEXT:    vpextrw $7, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:  LBB11_16: ## %else20
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v8i16_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %xmm1, %xmm1, %k0
@@ -4249,6 +4417,169 @@ define void @compressstore_v16i8_v16i8(i
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: compressstore_v16i8_v16i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrb $1, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store4
+; AVX512VLDQ-NEXT:    vpextrb $2, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_6: ## %else5
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrb $3, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store10
+; AVX512VLDQ-NEXT:    vpextrb $4, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_10: ## %else11
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrb $5, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store16
+; AVX512VLDQ-NEXT:    vpextrb $6, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_14: ## %else17
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store19
+; AVX512VLDQ-NEXT:    vpextrb $7, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_16: ## %else20
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.store22
+; AVX512VLDQ-NEXT:    vpextrb $8, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_18: ## %else23
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.store25
+; AVX512VLDQ-NEXT:    vpextrb $9, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_20: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.store28
+; AVX512VLDQ-NEXT:    vpextrb $10, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_22: ## %else29
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.store31
+; AVX512VLDQ-NEXT:    vpextrb $11, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_24: ## %else32
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.store34
+; AVX512VLDQ-NEXT:    vpextrb $12, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_26: ## %else35
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.store37
+; AVX512VLDQ-NEXT:    vpextrb $13, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_28: ## %else38
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.store40
+; AVX512VLDQ-NEXT:    vpextrb $14, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_30: ## %else41
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.store43
+; AVX512VLDQ-NEXT:    vpextrb $15, %xmm0, (%rdi)
+; AVX512VLDQ-NEXT:  LBB12_32: ## %else44
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: compressstore_v16i8_v16i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmb %xmm1, %xmm1, %k0

Modified: llvm/trunk/test/CodeGen/X86/masked_expandload.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_expandload.ll?rev=359395&r1=359394&r2=359395&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_expandload.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_expandload.ll Sun Apr 28 03:02:34 2019
@@ -4,7 +4,8 @@
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
 
 ;
 ; vXf64
@@ -82,11 +83,11 @@ define <2 x double> @expandload_v2f64_v2
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: expandload_v2f64_v2i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; AVX512VLBW-NEXT:    vexpandpd (%rdi), %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: expandload_v2f64_v2i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vexpandpd (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
   %res = call <2 x double> @llvm.masked.expandload.v2f64(double* %base, <2 x i1> %mask, <2 x double> %src0)
   ret <2 x double>%res
@@ -263,11 +264,11 @@ define <4 x double> @expandload_v4f64_v4
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: expandload_v4f64_v4i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %ymm1, %ymm1, %k1
-; AVX512VLBW-NEXT:    vexpandpd (%rdi), %ymm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: expandload_v4f64_v4i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %ymm1, %ymm1, %k1
+; AVX512VL-NEXT:    vexpandpd (%rdi), %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i64> %trigger, zeroinitializer
   %res = call <4 x double> @llvm.masked.expandload.v4f64(double* %base, <4 x i1> %mask, <4 x double> %src0)
   ret <4 x double>%res
@@ -471,6 +472,14 @@ define <8 x double> @expandload_v8f64_v8
 ; AVX512F-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v8f64_v8i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k1
+; AVX512VLDQ-NEXT:    vexpandpd (%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v8f64_v8i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm1, %xmm1
@@ -1180,6 +1189,31 @@ define <16 x double> @expandload_v16f64_
 ; AVX512F-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v16f64_v16i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512VLDQ-NEXT:    vptestnmd %ymm3, %ymm3, %k1
+; AVX512VLDQ-NEXT:    vptestnmd %ymm2, %ymm2, %k2
+; AVX512VLDQ-NEXT:    kmovb %k2, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    shrl %ecx
+; AVX512VLDQ-NEXT:    andl $-43, %ecx
+; AVX512VLDQ-NEXT:    subl %ecx, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    andl $858993459, %ecx ## imm = 0x33333333
+; AVX512VLDQ-NEXT:    shrl $2, %eax
+; AVX512VLDQ-NEXT:    andl $858993459, %eax ## imm = 0x33333333
+; AVX512VLDQ-NEXT:    addl %ecx, %eax
+; AVX512VLDQ-NEXT:    movl %eax, %ecx
+; AVX512VLDQ-NEXT:    shrl $4, %ecx
+; AVX512VLDQ-NEXT:    addl %eax, %ecx
+; AVX512VLDQ-NEXT:    andl $252645135, %ecx ## imm = 0xF0F0F0F
+; AVX512VLDQ-NEXT:    imull $16843009, %ecx, %eax ## imm = 0x1010101
+; AVX512VLDQ-NEXT:    shrl $24, %eax
+; AVX512VLDQ-NEXT:    vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
+; AVX512VLDQ-NEXT:    vexpandpd (%rdi), %zmm0 {%k2}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v16f64_v16i32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
@@ -1317,13 +1351,13 @@ define <2 x float> @expandload_v2f32_v2i
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: expandload_v2f32_v2i1:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; AVX512VLBW-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: expandload_v2f32_v2i1:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
   ret <2 x float> %res
@@ -1367,6 +1401,13 @@ define <4 x float> @expandload_v4f32_con
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v4f32_const:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $7, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v4f32_const:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $7, %al
@@ -1444,6 +1485,13 @@ define <16 x float> @expandload_v16f32_c
 ; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v16f32_const:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movw $30719, %ax ## imm = 0x77FF
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v16f32_const:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movw $30719, %ax ## imm = 0x77FF
@@ -1489,6 +1537,13 @@ define <16 x float> @expandload_v16f32_c
 ; AVX512F-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v16f32_const_undef:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v16f32_const_undef:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movw $-2049, %ax ## imm = 0xF7FF
@@ -2954,6 +3009,13 @@ define <2 x i64> @expandload_v2i64_const
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v2i64_const:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $2, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v2i64_const:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $2, %al
@@ -3094,11 +3156,11 @@ define <4 x i32> @expandload_v4i32_v4i32
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: expandload_v4i32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm1, %xmm1, %k1
-; AVX512VLBW-NEXT:    vpexpandd (%rdi), %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: expandload_v4i32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm1, %xmm1, %k1
+; AVX512VL-NEXT:    vpexpandd (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x i32> @llvm.masked.expandload.v4i32(i32* %base, <4 x i1> %mask, <4 x i32> %src0)
   ret <4 x i32>%res
@@ -3393,6 +3455,89 @@ define <8 x i16> @expandload_v8i16_v8i16
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v8i16_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrw $1, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load5
+; AVX512VLDQ-NEXT:    vpinsrw $2, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_6: ## %else6
+; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load9
+; AVX512VLDQ-NEXT:    vpinsrw $3, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_8: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrw $4, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_10: ## %else14
+; AVX512VLDQ-NEXT:    kshiftrb $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load17
+; AVX512VLDQ-NEXT:    vpinsrw $5, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_12: ## %else18
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm1, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load21
+; AVX512VLDQ-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    addq $2, %rdi
+; AVX512VLDQ-NEXT:  LBB11_14: ## %else22
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB11_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load25
+; AVX512VLDQ-NEXT:    vpinsrw $7, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:  LBB11_16: ## %else26
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v8i16_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %xmm1, %xmm1, %k0
@@ -4120,6 +4265,169 @@ define <16 x i8> @expandload_v16i8_v16i8
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: expandload_v16i8_v16i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrb $1, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load5
+; AVX512VLDQ-NEXT:    vpinsrb $2, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_6: ## %else6
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load9
+; AVX512VLDQ-NEXT:    vpinsrb $3, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_8: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrb $4, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_10: ## %else14
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load17
+; AVX512VLDQ-NEXT:    vpinsrb $5, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_12: ## %else18
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load21
+; AVX512VLDQ-NEXT:    vpinsrb $6, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_14: ## %else22
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load25
+; AVX512VLDQ-NEXT:    vpinsrb $7, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_16: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.load29
+; AVX512VLDQ-NEXT:    vpinsrb $8, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_18: ## %else30
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.load33
+; AVX512VLDQ-NEXT:    vpinsrb $9, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_20: ## %else34
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.load37
+; AVX512VLDQ-NEXT:    vpinsrb $10, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_22: ## %else38
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.load41
+; AVX512VLDQ-NEXT:    vpinsrb $11, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_24: ## %else42
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.load45
+; AVX512VLDQ-NEXT:    vpinsrb $12, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_26: ## %else46
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.load49
+; AVX512VLDQ-NEXT:    vpinsrb $13, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_28: ## %else50
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm1, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.load53
+; AVX512VLDQ-NEXT:    vpinsrb $14, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    incq %rdi
+; AVX512VLDQ-NEXT:  LBB12_30: ## %else54
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB12_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.load57
+; AVX512VLDQ-NEXT:    vpinsrb $15, (%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:  LBB12_32: ## %else58
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: expandload_v16i8_v16i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmb %xmm1, %xmm1, %k0

Modified: llvm/trunk/test/CodeGen/X86/masked_load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_load.ll?rev=359395&r1=359394&r2=359395&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_load.ll Sun Apr 28 03:02:34 2019
@@ -4,7 +4,8 @@
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
 ; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
 
 ;
 ; vXf64
@@ -94,11 +95,11 @@ define <2 x double> @load_v2f64_v2i64(<2
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v2f64_v2i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v2f64_v2i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
   %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
   ret <2 x double> %res
@@ -205,11 +206,11 @@ define <4 x double> @load_v4f64_v4i32(<4
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4f64_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4f64_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
   ret <4 x double> %res
@@ -317,11 +318,11 @@ define <4 x double> @load_v4f64_v4i32_ze
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4f64_v4i32_zero:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4f64_v4i32_zero:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovapd (%rdi), %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
   ret <4 x double> %res
@@ -430,11 +431,11 @@ define <4 x double> @load_v4f64_v4i64(<4
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4f64_v4i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; AVX512VLBW-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4f64_v4i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i64> %trigger, zeroinitializer
   %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
   ret <4 x double> %res
@@ -614,6 +615,15 @@ define <8 x double> @load_v8f64_v8i16(<8
 ; AVX512F-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8f64_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8f64_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %xmm0, %xmm0, %k1
@@ -888,13 +898,13 @@ define <2 x float> @load_v2f32_v2i32(<2
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v2f32_v2i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v2f32_v2i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
   ret <2 x float> %res
@@ -977,13 +987,13 @@ define <2 x float> @load_v2f32_v2i32_und
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v2f32_v2i32_undef:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v2f32_v2i32_undef:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
   ret <2 x float> %res
@@ -1085,11 +1095,11 @@ define <4 x float> @load_v4f32_v4i32(<4
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4f32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4f32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vblendmps (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
   ret <4 x float> %res
@@ -1253,6 +1263,14 @@ define <8 x float> @load_v8f32_v8i1_zero
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8f32_v8i1_zero:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vmovaps (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8f32_v8i1_zero:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
@@ -1440,11 +1458,11 @@ define <8 x float> @load_v8f32_v8i32(<8
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v8f32_v8i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VLBW-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v8f32_v8i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vblendmps (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
   %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
   ret <8 x float> %res
@@ -1552,11 +1570,11 @@ define <2 x i64> @load_v2i64_v2i64(<2 x
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v2i64_v2i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v2i64_v2i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i64> %trigger, zeroinitializer
   %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
   ret <2 x i64> %res
@@ -1667,11 +1685,11 @@ define <4 x i64> @load_v4i64_v4i64(<4 x
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4i64_v4i64:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmq %ymm0, %ymm0, %k1
-; AVX512VLBW-NEXT:    vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4i64_v4i64:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmq %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i64> %trigger, zeroinitializer
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
   ret <4 x i64> %res
@@ -1855,6 +1873,15 @@ define <8 x i64> @load_v8i64_v8i16(<8 x
 ; AVX512F-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8i64_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8i64_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %xmm0, %xmm0, %k1
@@ -2138,15 +2165,15 @@ define <2 x i32> @load_v2i32_v2i32(<2 x
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v2i32_v2i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
-; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
-; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v2i32_v2i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX512VL-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
+; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
   ret <2 x i32> %res
@@ -2255,11 +2282,11 @@ define <4 x i32> @load_v4i32_v4i32(<4 x
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: load_v4i32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: load_v4i32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
   ret <4 x i32> %res
@@ -2422,6 +2449,14 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8i32_v8i1:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8i32_v8i1:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
@@ -2589,6 +2624,14 @@ define <8 x i32> @load_v8i32_v8i1_zero(<
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8i32_v8i1_zero:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512VLDQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8i32_v8i1_zero:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpsllw $15, %xmm0, %xmm0
@@ -2864,6 +2907,83 @@ define <8 x i16> @load_v8i16_v8i16(<8 x
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v8i16_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load4
+; AVX512VLDQ-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_6: ## %else5
+; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load7
+; AVX512VLDQ-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load10
+; AVX512VLDQ-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_10: ## %else11
+; AVX512VLDQ-NEXT:    kshiftrb $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load16
+; AVX512VLDQ-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_14: ## %else17
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB21_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load19
+; AVX512VLDQ-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB21_16: ## %else20
+; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v8i16_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpmovw2m %xmm0, %k1
@@ -3574,6 +3694,178 @@ define <16 x i16> @load_v16i16_v16i16(<1
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v16i16_v16i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrw $0, (%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrw $1, 2(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load4
+; AVX512VLDQ-NEXT:    vpinsrw $2, 4(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_6: ## %else5
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load7
+; AVX512VLDQ-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load10
+; AVX512VLDQ-NEXT:    vpinsrw $4, 8(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_10: ## %else11
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrw $5, 10(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load16
+; AVX512VLDQ-NEXT:    vpinsrw $6, 12(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_14: ## %else17
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load19
+; AVX512VLDQ-NEXT:    vpinsrw $7, 14(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB22_16: ## %else20
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.load22
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $0, 16(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_18: ## %else23
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.load25
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $1, 18(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_20: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.load28
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $2, 20(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_22: ## %else29
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.load31
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $3, 22(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_24: ## %else32
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.load34
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $4, 24(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_26: ## %else35
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.load37
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrw $5, 26(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_28: ## %else38
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.load40
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_30: ## %else41
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB22_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.load43
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB22_32: ## %else44
+; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v16i16_v16i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpmovw2m %ymm0, %k1
@@ -4184,6 +4476,155 @@ define <16 x i8> @load_v16i8_v16i8(<16 x
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v16i8_v16i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load4
+; AVX512VLDQ-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_6: ## %else5
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load7
+; AVX512VLDQ-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load10
+; AVX512VLDQ-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_10: ## %else11
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load16
+; AVX512VLDQ-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_14: ## %else17
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load19
+; AVX512VLDQ-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_16: ## %else20
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.load22
+; AVX512VLDQ-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_18: ## %else23
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.load25
+; AVX512VLDQ-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_20: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.load28
+; AVX512VLDQ-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_22: ## %else29
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.load31
+; AVX512VLDQ-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_24: ## %else32
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.load34
+; AVX512VLDQ-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_26: ## %else35
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.load37
+; AVX512VLDQ-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_28: ## %else38
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.load40
+; AVX512VLDQ-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_30: ## %else41
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB23_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.load43
+; AVX512VLDQ-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm1
+; AVX512VLDQ-NEXT:  LBB23_32: ## %else44
+; AVX512VLDQ-NEXT:    vmovdqa %xmm1, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v16i8_v16i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpmovb2m %xmm0, %k1
@@ -5779,6 +6220,369 @@ define <32 x i8> @load_v32i8_v32i8(<32 x
 ; AVX512F-NEXT:    vmovdqa %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: load_v32i8_v32i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.load
+; AVX512VLDQ-NEXT:    vpinsrb $0, (%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_2: ## %else
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.load1
+; AVX512VLDQ-NEXT:    vpinsrb $1, 1(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.load4
+; AVX512VLDQ-NEXT:    vpinsrb $2, 2(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_6: ## %else5
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.load7
+; AVX512VLDQ-NEXT:    vpinsrb $3, 3(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_8: ## %else8
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.load10
+; AVX512VLDQ-NEXT:    vpinsrb $4, 4(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_10: ## %else11
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.load13
+; AVX512VLDQ-NEXT:    vpinsrb $5, 5(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_12: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.load16
+; AVX512VLDQ-NEXT:    vpinsrb $6, 6(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_14: ## %else17
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.load19
+; AVX512VLDQ-NEXT:    vpinsrb $7, 7(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_16: ## %else20
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.load22
+; AVX512VLDQ-NEXT:    vpinsrb $8, 8(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_18: ## %else23
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.load25
+; AVX512VLDQ-NEXT:    vpinsrb $9, 9(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_20: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.load28
+; AVX512VLDQ-NEXT:    vpinsrb $10, 10(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_22: ## %else29
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.load31
+; AVX512VLDQ-NEXT:    vpinsrb $11, 11(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_24: ## %else32
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.load34
+; AVX512VLDQ-NEXT:    vpinsrb $12, 12(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_26: ## %else35
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.load37
+; AVX512VLDQ-NEXT:    vpinsrb $13, 13(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_28: ## %else38
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.load40
+; AVX512VLDQ-NEXT:    vpinsrb $14, 14(%rdi), %xmm1, %xmm3
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_30: ## %else41
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.load43
+; AVX512VLDQ-NEXT:    vpinsrb $15, 15(%rdi), %xmm1, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512VLDQ-NEXT:  LBB24_32: ## %else44
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_34
+; AVX512VLDQ-NEXT:  ## %bb.33: ## %cond.load46
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $0, 16(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_34: ## %else47
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_36
+; AVX512VLDQ-NEXT:  ## %bb.35: ## %cond.load49
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $1, 17(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_36: ## %else50
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_38
+; AVX512VLDQ-NEXT:  ## %bb.37: ## %cond.load52
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $2, 18(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_38: ## %else53
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_40
+; AVX512VLDQ-NEXT:  ## %bb.39: ## %cond.load55
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $3, 19(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_40: ## %else56
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_42
+; AVX512VLDQ-NEXT:  ## %bb.41: ## %cond.load58
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $4, 20(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_42: ## %else59
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_44
+; AVX512VLDQ-NEXT:  ## %bb.43: ## %cond.load61
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $5, 21(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_44: ## %else62
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_46
+; AVX512VLDQ-NEXT:  ## %bb.45: ## %cond.load64
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $6, 22(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_46: ## %else65
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_48
+; AVX512VLDQ-NEXT:  ## %bb.47: ## %cond.load67
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $7, 23(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_48: ## %else68
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_50
+; AVX512VLDQ-NEXT:  ## %bb.49: ## %cond.load70
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $8, 24(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_50: ## %else71
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_52
+; AVX512VLDQ-NEXT:  ## %bb.51: ## %cond.load73
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $9, 25(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_52: ## %else74
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_54
+; AVX512VLDQ-NEXT:  ## %bb.53: ## %cond.load76
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $10, 26(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_54: ## %else77
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_56
+; AVX512VLDQ-NEXT:  ## %bb.55: ## %cond.load79
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $11, 27(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_56: ## %else80
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_58
+; AVX512VLDQ-NEXT:  ## %bb.57: ## %cond.load82
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $12, 28(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_58: ## %else83
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_60
+; AVX512VLDQ-NEXT:  ## %bb.59: ## %cond.load85
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpinsrb $13, 29(%rdi), %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_60: ## %else86
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_62
+; AVX512VLDQ-NEXT:  ## %bb.61: ## %cond.load88
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpinsrb $14, 30(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_62: ## %else89
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB24_64
+; AVX512VLDQ-NEXT:  ## %bb.63: ## %cond.load91
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpinsrb $15, 31(%rdi), %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm1
+; AVX512VLDQ-NEXT:  LBB24_64: ## %else92
+; AVX512VLDQ-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: load_v32i8_v32i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpmovb2m %ymm0, %k1
@@ -5829,6 +6633,13 @@ define <4 x float> @mload_constmask_v4f3
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4f32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $13, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovups (%rdi), %xmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4f32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $13, %al
@@ -5859,11 +6670,11 @@ define <4 x float> @mload_constmask_v4f3
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: mload_constmask_v4f32_all:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    kxnorw %k0, %k0, %k1
-; AVX512VLBW-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: mload_constmask_v4f32_all:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512VL-NEXT:    vmovups (%rdi), %xmm0 {%k1} {z}
+; AVX512VL-NEXT:    retq
   %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
   ret <4 x float> %res
 }
@@ -5929,6 +6740,13 @@ define <4 x i32> @mload_constmask_v4i32(
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $14, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4i32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $14, %al
@@ -5997,6 +6815,13 @@ define <8 x float> @mload_constmask_v8f3
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v8f32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $7, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovups (%rdi), %ymm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v8f32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $7, %al
@@ -6030,6 +6855,13 @@ define <4 x double> @mload_constmask_v4f
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4f64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $7, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4f64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $7, %al
@@ -6079,6 +6911,13 @@ define <8 x i32> @mload_constmask_v8i32(
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v8i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $-121, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v8i32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $-121, %al
@@ -6117,6 +6956,13 @@ define <4 x i64> @mload_constmask_v4i64(
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4i64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $9, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4i64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $9, %al
@@ -6150,6 +6996,13 @@ define <8 x double> @mload_constmask_v8f
 ; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v8f64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $-121, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v8f64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $-121, %al
@@ -6183,6 +7036,13 @@ define <4 x double> @mload_constmask_v4f
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4f64_undef_passthrough:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $7, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4f64_undef_passthrough:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $7, %al
@@ -6221,6 +7081,13 @@ define <4 x i64> @mload_constmask_v4i64_
 ; AVX512F-NEXT:    ## kill: def $ymm0 killed $ymm0 killed $zmm0
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: mload_constmask_v4i64_undef_passthrough:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    movb $6, %al
+; AVX512VLDQ-NEXT:    kmovw %eax, %k1
+; AVX512VLDQ-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: mload_constmask_v4i64_undef_passthrough:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    movb $6, %al

Modified: llvm/trunk/test/CodeGen/X86/masked_store.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_store.ll?rev=359395&r1=359394&r2=359395&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_store.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_store.ll Sun Apr 28 03:02:34 2019
@@ -4,7 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx     | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2    | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
 
 ;
 ; vXf64
@@ -94,6 +95,12 @@ define void @store_v2f64_v2i64(<2 x i64>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v2f64_v2i64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovq2m %xmm0, %k1
+; AVX512VLDQ-NEXT:    vmovupd %xmm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v2f64_v2i64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -201,6 +208,13 @@ define void @store_v4f64_v4i64(<4 x i64>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v4f64_v4i64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovq2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vmovupd %ymm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v4f64_v4i64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -289,21 +303,18 @@ define void @store_v2f32_v2i32(<2 x i32>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: store_v2f32_v2i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vmovups %xmm1, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: store_v2f32_v2i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovups %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
   ret void
 }
 
-; PR34584: The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
-; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
-
 define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
 ; SSE2-LABEL: store_v4f32_v4i32:
 ; SSE2:       ## %bb.0:
@@ -391,6 +402,12 @@ define void @store_v4f32_v4i32(<4 x floa
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v4f32_v4i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm2, %k1
+; AVX512VLDQ-NEXT:    vmovups %xmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v4f32_v4i32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -555,6 +572,13 @@ define void @store_v8f32_v8i32(<8 x floa
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v8f32_v8i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k1
+; AVX512VLDQ-NEXT:    vmovups %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v8f32_v8i32:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
@@ -835,13 +859,28 @@ define void @store_v16f32_v16i32(<16 x f
 ; AVX1OR2-NEXT:    vzeroupper
 ; AVX1OR2-NEXT:    retq
 ;
-; AVX512-LABEL: store_v16f32_v16i32:
-; AVX512:       ## %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpgtd %zmm2, %zmm1, %k1
-; AVX512-NEXT:    vmovups %zmm0, (%rdi) {%k1}
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: store_v16f32_v16i32:
+; AVX512F:       ## %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm1, %k1
+; AVX512F-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512VLDQ-LABEL: store_v16f32_v16i32:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k1
+; AVX512VLDQ-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
+; AVX512VLBW-LABEL: store_v16f32_v16i32:
+; AVX512VLBW:       ## %bb.0:
+; AVX512VLBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VLBW-NEXT:    vpcmpgtd %zmm2, %zmm1, %k1
+; AVX512VLBW-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; AVX512VLBW-NEXT:    vzeroupper
+; AVX512VLBW-NEXT:    retq
   %bool_mask = icmp slt <16 x i32> %mask, zeroinitializer
   call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %x, <16 x float>* %ptr, i32 1, <16 x i1> %bool_mask)
   ret void
@@ -918,6 +957,12 @@ define void @store_v2i64_v2i64(<2 x i64>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v2i64_v2i64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovq2m %xmm0, %k1
+; AVX512VLDQ-NEXT:    vmovdqu64 %xmm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v2i64_v2i64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -1033,6 +1078,13 @@ define void @store_v4i64_v4i64(<4 x i64>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v4i64_v4i64:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpmovq2m %ymm0, %k1
+; AVX512VLDQ-NEXT:    vmovdqu64 %ymm1, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v4i64_v4i64:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -1146,13 +1198,13 @@ define void @store_v2i32_v2i32(<2 x i32>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: store_v2i32_v2i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: store_v2i32_v2i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
+; AVX512VL-NEXT:    vptestnmq %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vpmovqd %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
   ret void
@@ -1251,11 +1303,11 @@ define void @store_v4i32_v4i32(<4 x i32>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: store_v4i32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %xmm0, %xmm0, %k1
-; AVX512VLBW-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: store_v4i32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
   ret void
@@ -1422,12 +1474,12 @@ define void @store_v8i32_v8i32(<8 x i32>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: store_v8i32_v8i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestnmd %ymm0, %ymm0, %k1
-; AVX512VLBW-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    vzeroupper
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: store_v8i32_v8i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
   ret void
@@ -1702,6 +1754,82 @@ define void @store_v8i16_v8i16(<8 x i16>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v8i16_v8i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, (%rdi)
+; AVX512VLDQ-NEXT:  LBB13_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrw $1, %xmm1, 2(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT:    vpextrw $2, %xmm1, 4(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_6: ## %else4
+; AVX512VLDQ-NEXT:    kshiftrb $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT:    vpextrw $3, %xmm1, 6(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_8: ## %else6
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm2, %ymm2
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrw $4, %xmm1, 8(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_10: ## %else8
+; AVX512VLDQ-NEXT:    kshiftrb $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT:    vpextrw $5, %xmm1, 10(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_12: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovd2m %ymm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrb $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT:    vpextrw $6, %xmm1, 12(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_14: ## %else12
+; AVX512VLDQ-NEXT:    kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB13_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrw $7, %xmm1, 14(%rdi)
+; AVX512VLDQ-NEXT:  LBB13_16: ## %else14
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v8i16_v8i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %xmm0, %xmm0, %k1
@@ -2376,6 +2504,162 @@ define void @store_v16i16_v16i16(<16 x i
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v16i16_v16i16:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrw $0, %xmm1, (%rdi)
+; AVX512VLDQ-NEXT:  LBB14_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrw $1, %xmm1, 2(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT:    vpextrw $2, %xmm1, 4(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_6: ## %else4
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT:    vpextrw $3, %xmm1, 6(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_8: ## %else6
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrw $4, %xmm1, 8(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_10: ## %else8
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT:    vpextrw $5, %xmm1, 10(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_12: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT:    vpextrw $6, %xmm1, 12(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_14: ## %else12
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrw $7, %xmm1, 14(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_16: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.store15
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $0, %xmm2, 16(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_18: ## %else16
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.store17
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $1, %xmm2, 18(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_20: ## %else18
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.store19
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $2, %xmm2, 20(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_22: ## %else20
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.store21
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $3, %xmm2, 22(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_24: ## %else22
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.store23
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $4, %xmm2, 24(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_26: ## %else24
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.store25
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrw $5, %xmm2, 26(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_28: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.store27
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpextrw $6, %xmm0, 28(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_30: ## %else28
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB14_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.store29
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpextrw $7, %xmm0, 30(%rdi)
+; AVX512VLDQ-NEXT:  LBB14_32: ## %else30
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v16i16_v16i16:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmw %ymm0, %ymm0, %k1
@@ -2908,6 +3192,154 @@ define void @store_v16i8_v16i8(<16 x i8>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v16i8_v16i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrb $0, %xmm1, (%rdi)
+; AVX512VLDQ-NEXT:  LBB15_2: ## %else
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrb $1, %xmm1, 1(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT:    vpextrb $2, %xmm1, 2(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_6: ## %else4
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT:    vpextrb $3, %xmm1, 3(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_8: ## %else6
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrb $4, %xmm1, 4(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_10: ## %else8
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT:    vpextrb $5, %xmm1, 5(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_12: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT:    vpextrb $6, %xmm1, 6(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_14: ## %else12
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrb $7, %xmm1, 7(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_16: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.store15
+; AVX512VLDQ-NEXT:    vpextrb $8, %xmm1, 8(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_18: ## %else16
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.store17
+; AVX512VLDQ-NEXT:    vpextrb $9, %xmm1, 9(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_20: ## %else18
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.store19
+; AVX512VLDQ-NEXT:    vpextrb $10, %xmm1, 10(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_22: ## %else20
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.store21
+; AVX512VLDQ-NEXT:    vpextrb $11, %xmm1, 11(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_24: ## %else22
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.store23
+; AVX512VLDQ-NEXT:    vpextrb $12, %xmm1, 12(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_26: ## %else24
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.store25
+; AVX512VLDQ-NEXT:    vpextrb $13, %xmm1, 13(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_28: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.store27
+; AVX512VLDQ-NEXT:    vpextrb $14, %xmm1, 14(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_30: ## %else28
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB15_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.store29
+; AVX512VLDQ-NEXT:    vpextrb $15, %xmm1, 15(%rdi)
+; AVX512VLDQ-NEXT:  LBB15_32: ## %else30
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v16i8_v16i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmb %xmm0, %xmm0, %k1
@@ -4253,6 +4685,337 @@ define void @store_v32i8_v32i8(<32 x i8>
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: store_v32i8_v32i8:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_2
+; AVX512VLDQ-NEXT:  ## %bb.1: ## %cond.store
+; AVX512VLDQ-NEXT:    vpextrb $0, %xmm1, (%rdi)
+; AVX512VLDQ-NEXT:  LBB16_2: ## %else
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_4
+; AVX512VLDQ-NEXT:  ## %bb.3: ## %cond.store1
+; AVX512VLDQ-NEXT:    vpextrb $1, %xmm1, 1(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_4: ## %else2
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_6
+; AVX512VLDQ-NEXT:  ## %bb.5: ## %cond.store3
+; AVX512VLDQ-NEXT:    vpextrb $2, %xmm1, 2(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_6: ## %else4
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_8
+; AVX512VLDQ-NEXT:  ## %bb.7: ## %cond.store5
+; AVX512VLDQ-NEXT:    vpextrb $3, %xmm1, 3(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_8: ## %else6
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_10
+; AVX512VLDQ-NEXT:  ## %bb.9: ## %cond.store7
+; AVX512VLDQ-NEXT:    vpextrb $4, %xmm1, 4(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_10: ## %else8
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_12
+; AVX512VLDQ-NEXT:  ## %bb.11: ## %cond.store9
+; AVX512VLDQ-NEXT:    vpextrb $5, %xmm1, 5(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_12: ## %else10
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_14
+; AVX512VLDQ-NEXT:  ## %bb.13: ## %cond.store11
+; AVX512VLDQ-NEXT:    vpextrb $6, %xmm1, 6(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_14: ## %else12
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_16
+; AVX512VLDQ-NEXT:  ## %bb.15: ## %cond.store13
+; AVX512VLDQ-NEXT:    vpextrb $7, %xmm1, 7(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_16: ## %else14
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_18
+; AVX512VLDQ-NEXT:  ## %bb.17: ## %cond.store15
+; AVX512VLDQ-NEXT:    vpextrb $8, %xmm1, 8(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_18: ## %else16
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_20
+; AVX512VLDQ-NEXT:  ## %bb.19: ## %cond.store17
+; AVX512VLDQ-NEXT:    vpextrb $9, %xmm1, 9(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_20: ## %else18
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_22
+; AVX512VLDQ-NEXT:  ## %bb.21: ## %cond.store19
+; AVX512VLDQ-NEXT:    vpextrb $10, %xmm1, 10(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_22: ## %else20
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_24
+; AVX512VLDQ-NEXT:  ## %bb.23: ## %cond.store21
+; AVX512VLDQ-NEXT:    vpextrb $11, %xmm1, 11(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_24: ## %else22
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_26
+; AVX512VLDQ-NEXT:  ## %bb.25: ## %cond.store23
+; AVX512VLDQ-NEXT:    vpextrb $12, %xmm1, 12(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_26: ## %else24
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_28
+; AVX512VLDQ-NEXT:  ## %bb.27: ## %cond.store25
+; AVX512VLDQ-NEXT:    vpextrb $13, %xmm1, 13(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_28: ## %else26
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm3
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm3, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_30
+; AVX512VLDQ-NEXT:  ## %bb.29: ## %cond.store27
+; AVX512VLDQ-NEXT:    vpextrb $14, %xmm1, 14(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_30: ## %else28
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_32
+; AVX512VLDQ-NEXT:  ## %bb.31: ## %cond.store29
+; AVX512VLDQ-NEXT:    vpextrb $15, %xmm1, 15(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_32: ## %else30
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_34
+; AVX512VLDQ-NEXT:  ## %bb.33: ## %cond.store31
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $0, %xmm2, 16(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_34: ## %else32
+; AVX512VLDQ-NEXT:    kshiftrw $1, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_36
+; AVX512VLDQ-NEXT:  ## %bb.35: ## %cond.store33
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $1, %xmm2, 17(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_36: ## %else34
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $2, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_38
+; AVX512VLDQ-NEXT:  ## %bb.37: ## %cond.store35
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $2, %xmm2, 18(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_38: ## %else36
+; AVX512VLDQ-NEXT:    kshiftrw $3, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_40
+; AVX512VLDQ-NEXT:  ## %bb.39: ## %cond.store37
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $3, %xmm2, 19(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_40: ## %else38
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $4, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_42
+; AVX512VLDQ-NEXT:  ## %bb.41: ## %cond.store39
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $4, %xmm2, 20(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_42: ## %else40
+; AVX512VLDQ-NEXT:    kshiftrw $5, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_44
+; AVX512VLDQ-NEXT:  ## %bb.43: ## %cond.store41
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $5, %xmm2, 21(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_44: ## %else42
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $6, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_46
+; AVX512VLDQ-NEXT:  ## %bb.45: ## %cond.store43
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $6, %xmm2, 22(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_46: ## %else44
+; AVX512VLDQ-NEXT:    kshiftrw $7, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_48
+; AVX512VLDQ-NEXT:  ## %bb.47: ## %cond.store45
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $7, %xmm2, 23(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_48: ## %else46
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $8, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_50
+; AVX512VLDQ-NEXT:  ## %bb.49: ## %cond.store47
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $8, %xmm2, 24(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_50: ## %else48
+; AVX512VLDQ-NEXT:    kshiftrw $9, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_52
+; AVX512VLDQ-NEXT:  ## %bb.51: ## %cond.store49
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $9, %xmm2, 25(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_52: ## %else50
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $10, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_54
+; AVX512VLDQ-NEXT:  ## %bb.53: ## %cond.store51
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $10, %xmm2, 26(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_54: ## %else52
+; AVX512VLDQ-NEXT:    kshiftrw $11, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_56
+; AVX512VLDQ-NEXT:  ## %bb.55: ## %cond.store53
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $11, %xmm2, 27(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_56: ## %else54
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm2
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm2, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_58
+; AVX512VLDQ-NEXT:  ## %bb.57: ## %cond.store55
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $12, %xmm2, 28(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_58: ## %else56
+; AVX512VLDQ-NEXT:    kshiftrw $13, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_60
+; AVX512VLDQ-NEXT:  ## %bb.59: ## %cond.store57
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512VLDQ-NEXT:    vpextrb $13, %xmm2, 29(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_60: ## %else58
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512VLDQ-NEXT:    vpmovd2m %zmm0, %k0
+; AVX512VLDQ-NEXT:    kshiftrw $14, %k0, %k1
+; AVX512VLDQ-NEXT:    kmovw %k1, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_62
+; AVX512VLDQ-NEXT:  ## %bb.61: ## %cond.store59
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpextrb $14, %xmm0, 30(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_62: ## %else60
+; AVX512VLDQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512VLDQ-NEXT:    kmovw %k0, %eax
+; AVX512VLDQ-NEXT:    testb $1, %al
+; AVX512VLDQ-NEXT:    je LBB16_64
+; AVX512VLDQ-NEXT:  ## %bb.63: ## %cond.store61
+; AVX512VLDQ-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VLDQ-NEXT:    vpextrb $15, %xmm0, 31(%rdi)
+; AVX512VLDQ-NEXT:  LBB16_64: ## %else62
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: store_v32i8_v32i8:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vptestnmb %ymm0, %ymm0, %k1
@@ -4293,11 +5056,11 @@ define void @mstore_constmask_v4i32_v4i3
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: mstore_constmask_v4i32_v4i32:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    kxnorw %k0, %k0, %k1
-; AVX512VLBW-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: mstore_constmask_v4i32_v4i32:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512VL-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask = icmp eq <4 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
   ret void
@@ -4491,6 +5254,14 @@ define void @masked_store_bool_mask_dema
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm1, %k1
+; AVX512VLDQ-NEXT:    vmovupd %ymm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    vzeroupper
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -4606,11 +5377,11 @@ define void @one_mask_bit_set1_variable(
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VLBW-LABEL: one_mask_bit_set1_variable:
-; AVX512VLBW:       ## %bb.0:
-; AVX512VLBW-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1
-; AVX512VLBW-NEXT:    vmovups %xmm0, (%rdi) {%k1}
-; AVX512VLBW-NEXT:    retq
+; AVX512VL-LABEL: one_mask_bit_set1_variable:
+; AVX512VL:       ## %bb.0:
+; AVX512VL-NEXT:    vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1
+; AVX512VL-NEXT:    vmovups %xmm0, (%rdi) {%k1}
+; AVX512VL-NEXT:    retq
   %mask_signbit = and <4 x i32> %mask, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
   %mask_bool = icmp ne <4 x i32> %mask_signbit, zeroinitializer
   call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 1, <4 x i1> %mask_bool)
@@ -4708,6 +5479,17 @@ define void @widen_masked_store(<3 x i32
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
+; AVX512VLDQ-LABEL: widen_masked_store:
+; AVX512VLDQ:       ## %bb.0:
+; AVX512VLDQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm1, %k0
+; AVX512VLDQ-NEXT:    vpmovm2d %k0, %xmm1
+; AVX512VLDQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLDQ-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512VLDQ-NEXT:    vpmovd2m %xmm1, %k1
+; AVX512VLDQ-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
+; AVX512VLDQ-NEXT:    retq
+;
 ; AVX512VLBW-LABEL: widen_masked_store:
 ; AVX512VLBW:       ## %bb.0:
 ; AVX512VLBW-NEXT:    vpslld $31, %xmm1, %xmm1