[llvm] r312474 - [X86] Replace -mcpu option with -mattr in LIT tests added in https://reviews.llvm.org/rL312442

Ayman Musa via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 4 02:31:33 PDT 2017


Author: aymanmus
Date: Mon Sep  4 02:31:32 2017
New Revision: 312474

URL: http://llvm.org/viewvc/llvm-project?rev=312474&view=rev
Log:
[X86] Replace -mcpu option with -mattr in LIT tests added in https://reviews.llvm.org/rL312442

Modified:
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll
    llvm/trunk/test/CodeGen/X86/avx512-shuffles/unpack.ll

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-fp.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 define <4 x double> @test_double_to_4(double %s) {
 ; CHECK-LABEL: test_double_to_4:
@@ -14,7 +14,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -28,7 +28,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -40,7 +40,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -54,7 +54,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -66,7 +66,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -80,7 +80,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -92,7 +92,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -106,7 +106,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -127,7 +127,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -141,7 +141,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -153,7 +153,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $103, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -167,7 +167,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $103, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -179,7 +179,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -193,7 +193,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -205,7 +205,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $78, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -219,7 +219,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $78, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x double> undef, double %s, i32 0
@@ -240,7 +240,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -254,7 +254,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -266,7 +266,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -280,7 +280,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -292,7 +292,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -306,7 +306,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -318,7 +318,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -332,7 +332,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -353,7 +353,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -367,7 +367,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -379,7 +379,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -393,7 +393,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -405,7 +405,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -419,7 +419,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -431,7 +431,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -445,7 +445,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -466,7 +466,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -480,7 +480,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -492,7 +492,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -506,7 +506,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -518,7 +518,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -532,7 +532,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -544,7 +544,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -558,7 +558,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = insertelement <2 x float> undef, float %s, i32 0
@@ -580,7 +580,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -594,7 +594,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -607,7 +607,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -621,7 +621,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -634,7 +634,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -648,7 +648,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -661,7 +661,7 @@ define <4 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_4_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -675,7 +675,7 @@ define <4 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_4_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -698,7 +698,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $120, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -712,7 +712,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $120, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -725,7 +725,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -739,7 +739,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -752,7 +752,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -766,7 +766,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -779,7 +779,7 @@ define <8 x double> @test_masked_double_
 ; CHECK-LABEL: test_masked_double_to_8_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -793,7 +793,7 @@ define <8 x double> @test_masked_z_doubl
 ; CHECK-LABEL: test_masked_z_double_to_8_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load double, double* %p
@@ -816,7 +816,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -830,7 +830,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -843,7 +843,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -857,7 +857,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -870,7 +870,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -884,7 +884,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -897,7 +897,7 @@ define <4 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_4_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -911,7 +911,7 @@ define <4 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_4_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -934,7 +934,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -948,7 +948,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $67, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -961,7 +961,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -975,7 +975,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -988,7 +988,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1002,7 +1002,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1015,7 +1015,7 @@ define <8 x float> @test_masked_float_to
 ; CHECK-LABEL: test_masked_float_to_8_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1029,7 +1029,7 @@ define <8 x float> @test_masked_z_float_
 ; CHECK-LABEL: test_masked_z_float_to_8_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1052,7 +1052,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-18370, %ax # imm = 0xB83E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1066,7 +1066,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-18370, %ax # imm = 0xB83E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1079,7 +1079,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26137, %ax # imm = 0x6619
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1093,7 +1093,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26137, %ax # imm = 0x6619
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1106,7 +1106,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1120,7 +1120,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1133,7 +1133,7 @@ define <16 x float> @test_masked_float_t
 ; CHECK-LABEL: test_masked_float_to_16_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p
@@ -1147,7 +1147,7 @@ define <16 x float> @test_masked_z_float
 ; CHECK-LABEL: test_masked_z_float_to_16_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %s = load float, float* %p

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-scalar-int.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
 
 define <16 x i8> @test_i8_to_16(i8 %s) {
 ; CHECK-LABEL: test_i8_to_16:

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-fp.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
 
 define <8 x float> @test_2xfloat_to_8xfloat(<8 x float> %vec) {
 ; CHECK-LABEL: test_2xfloat_to_8xfloat:
@@ -13,7 +13,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -26,7 +26,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -37,7 +37,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -50,7 +50,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -61,7 +61,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -74,7 +74,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -85,7 +85,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK-LABEL: test_masked_2xfloat_to_8xfloat_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -98,7 +98,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK-LABEL: test_masked_z_2xfloat_to_8xfloat_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -117,7 +117,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21312, %ax # imm = 0x5340
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -130,7 +130,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21312, %ax # imm = 0x5340
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -141,7 +141,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8490, %ax # imm = 0xDED6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -154,7 +154,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8490, %ax # imm = 0xDED6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -165,7 +165,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12522, %ax # imm = 0x30EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -178,7 +178,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12522, %ax # imm = 0x30EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -189,7 +189,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK-LABEL: test_masked_2xfloat_to_16xfloat_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28344, %ax # imm = 0x9148
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -202,7 +202,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK-LABEL: test_masked_z_2xfloat_to_16xfloat_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28344, %ax # imm = 0x9148
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -222,7 +222,7 @@ define <4 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -235,7 +235,7 @@ define <4 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -247,7 +247,7 @@ define <4 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -260,7 +260,7 @@ define <4 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -272,7 +272,7 @@ define <4 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -285,7 +285,7 @@ define <4 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -297,7 +297,7 @@ define <4 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_4xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -310,7 +310,7 @@ define <4 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_4xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -331,7 +331,7 @@ define <8 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -344,7 +344,7 @@ define <8 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -356,7 +356,7 @@ define <8 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -369,7 +369,7 @@ define <8 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -381,7 +381,7 @@ define <8 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -394,7 +394,7 @@ define <8 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -406,7 +406,7 @@ define <8 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_to_8xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -419,7 +419,7 @@ define <8 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_to_8xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -440,7 +440,7 @@ define <8 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -453,7 +453,7 @@ define <8 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -465,7 +465,7 @@ define <8 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -478,7 +478,7 @@ define <8 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -490,7 +490,7 @@ define <8 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -503,7 +503,7 @@ define <8 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -515,7 +515,7 @@ define <8 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_to_8xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -528,7 +528,7 @@ define <8 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_to_8xdouble_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -551,7 +551,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -565,7 +565,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -578,7 +578,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movb $-118, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -592,7 +592,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movb $-118, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -605,7 +605,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movb $-11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -619,7 +619,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movb $-11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -632,7 +632,7 @@ define <8 x float> @test_masked_2xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movb $-102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -646,7 +646,7 @@ define <8 x float> @test_masked_z_2xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movb $-102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -669,7 +669,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -683,7 +683,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -696,7 +696,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -710,7 +710,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -723,7 +723,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -737,7 +737,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -750,7 +750,7 @@ define <16 x float> @test_masked_2xfloat
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -764,7 +764,7 @@ define <16 x float> @test_masked_z_2xflo
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x float>, <2 x float>* %vp
@@ -785,7 +785,7 @@ define <8 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-109, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -798,7 +798,7 @@ define <8 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-109, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -810,7 +810,7 @@ define <8 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -823,7 +823,7 @@ define <8 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -835,7 +835,7 @@ define <8 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -848,7 +848,7 @@ define <8 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -860,7 +860,7 @@ define <8 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_to_8xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $48, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -873,7 +873,7 @@ define <8 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_to_8xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $48, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -894,7 +894,7 @@ define <16 x float> @test_masked_4xfloat
 ; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-25378, %ax # imm = 0x9CDE
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -907,7 +907,7 @@ define <16 x float> @test_masked_z_4xflo
 ; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-25378, %ax # imm = 0x9CDE
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -919,7 +919,7 @@ define <16 x float> @test_masked_4xfloat
 ; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-22502, %ax # imm = 0xA81A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -932,7 +932,7 @@ define <16 x float> @test_masked_z_4xflo
 ; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-22502, %ax # imm = 0xA81A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -944,7 +944,7 @@ define <16 x float> @test_masked_4xfloat
 ; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -957,7 +957,7 @@ define <16 x float> @test_masked_z_4xflo
 ; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -969,7 +969,7 @@ define <16 x float> @test_masked_4xfloat
 ; CHECK-LABEL: test_masked_4xfloat_to_16xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -982,7 +982,7 @@ define <16 x float> @test_masked_z_4xflo
 ; CHECK-LABEL: test_masked_z_4xfloat_to_16xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -1003,7 +1003,7 @@ define <16 x float> @test_masked_8xfloat
 ; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15887, %ax # imm = 0xC1F1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1016,7 +1016,7 @@ define <16 x float> @test_masked_z_8xflo
 ; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15887, %ax # imm = 0xC1F1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1028,7 +1028,7 @@ define <16 x float> @test_masked_8xfloat
 ; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8077, %ax # imm = 0xE073
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1041,7 +1041,7 @@ define <16 x float> @test_masked_z_8xflo
 ; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8077, %ax # imm = 0xE073
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1053,7 +1053,7 @@ define <16 x float> @test_masked_8xfloat
 ; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5023, %ax # imm = 0xEC61
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1066,7 +1066,7 @@ define <16 x float> @test_masked_z_8xflo
 ; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5023, %ax # imm = 0xEC61
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1078,7 +1078,7 @@ define <16 x float> @test_masked_8xfloat
 ; CHECK-LABEL: test_masked_8xfloat_to_16xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10326, %ax # imm = 0xD7AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1091,7 +1091,7 @@ define <16 x float> @test_masked_z_8xflo
 ; CHECK-LABEL: test_masked_z_8xfloat_to_16xfloat_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10326, %ax # imm = 0xD7AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s
 
 ; FIXME: fixing PR34394 should fix the i32x2 memory cases resulting in a simple vbroadcasti32x2 instruction.
 
@@ -15,7 +15,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -28,7 +28,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -39,7 +39,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -52,7 +52,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -63,7 +63,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -76,7 +76,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -87,7 +87,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK-LABEL: test_masked_2xi32_to_4xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -100,7 +100,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_4xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -119,7 +119,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -132,7 +132,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -143,7 +143,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -156,7 +156,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -167,7 +167,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -180,7 +180,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -191,7 +191,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -204,7 +204,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -223,7 +223,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-18638, %ax # imm = 0xB732
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -236,7 +236,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-18638, %ax # imm = 0xB732
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -247,7 +247,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25429, %ax # imm = 0x6355
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -260,7 +260,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25429, %ax # imm = 0x6355
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -271,7 +271,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27159, %ax # imm = 0x6A17
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -284,7 +284,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27159, %ax # imm = 0x6A17
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -295,7 +295,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-LABEL: test_masked_2xi32_to_16xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-22884, %ax # imm = 0xA69C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -308,7 +308,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-LABEL: test_masked_z_2xi32_to_16xi32_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-22884, %ax # imm = 0xA69C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} zmm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
@@ -330,7 +330,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -344,7 +344,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -357,7 +357,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -371,7 +371,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -384,7 +384,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -398,7 +398,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -411,7 +411,7 @@ define <4 x i32> @test_masked_2xi32_to_4
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = xmm1[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -425,7 +425,7 @@ define <4 x i32> @test_masked_z_2xi32_to
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[0,2,0,2]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -450,7 +450,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    movb $-94, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -465,7 +465,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    movb $-94, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -479,7 +479,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    movb $97, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -494,7 +494,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    movb $97, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -508,7 +508,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    movb $-33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -523,7 +523,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    movb $-33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -537,7 +537,7 @@ define <8 x i32> @test_masked_2xi32_to_8
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT:    movb $-111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -552,7 +552,7 @@ define <8 x i32> @test_masked_z_2xi32_to
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; CHECK-NEXT:    movb $-111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -577,7 +577,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -592,7 +592,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -606,7 +606,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -621,7 +621,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -635,7 +635,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $-19542, %ax # imm = 0xB3AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -650,7 +650,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $-19542, %ax # imm = 0xB3AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -664,7 +664,7 @@ define <16 x i32> @test_masked_2xi32_to_
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $27409, %ax # imm = 0x6B11
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm1, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -679,7 +679,7 @@ define <16 x i32> @test_masked_z_2xi32_t
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
 ; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,2,0,2,0,2,0,2,0,2,0,2,0,2,0,2]
 ; CHECK-NEXT:    movw $27409, %ax # imm = 0x6B11
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <2 x i32>, <2 x i32>* %vp
@@ -700,7 +700,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -713,7 +713,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -725,7 +725,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -738,7 +738,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -750,7 +750,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -763,7 +763,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -775,7 +775,7 @@ define <8 x i32> @test_masked_4xi32_to_8
 ; CHECK-LABEL: test_masked_4xi32_to_8xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -788,7 +788,7 @@ define <8 x i32> @test_masked_z_4xi32_to
 ; CHECK-LABEL: test_masked_z_4xi32_to_8xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -809,7 +809,7 @@ define <16 x i32> @test_masked_4xi32_to_
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $10334, %ax # imm = 0x285E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -822,7 +822,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $10334, %ax # imm = 0x285E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -834,7 +834,7 @@ define <16 x i32> @test_masked_4xi32_to_
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30962, %ax # imm = 0x870E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -847,7 +847,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30962, %ax # imm = 0x870E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -859,7 +859,7 @@ define <16 x i32> @test_masked_4xi32_to_
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31933, %ax # imm = 0x7CBD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -872,7 +872,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31933, %ax # imm = 0x7CBD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -884,7 +884,7 @@ define <16 x i32> @test_masked_4xi32_to_
 ; CHECK-LABEL: test_masked_4xi32_to_16xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28744, %ax # imm = 0x8FB8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -897,7 +897,7 @@ define <16 x i32> @test_masked_z_4xi32_t
 ; CHECK-LABEL: test_masked_z_4xi32_to_16xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28744, %ax # imm = 0x8FB8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i32>, <4 x i32>* %vp
@@ -918,7 +918,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -931,7 +931,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -943,7 +943,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -956,7 +956,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -968,7 +968,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -981,7 +981,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -993,7 +993,7 @@ define <4 x i64> @test_masked_2xi64_to_4
 ; CHECK-LABEL: test_masked_2xi64_to_4xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1006,7 +1006,7 @@ define <4 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_4xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1027,7 +1027,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $119, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1040,7 +1040,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $119, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1052,7 +1052,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1065,7 +1065,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1077,7 +1077,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1090,7 +1090,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-33, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1102,7 +1102,7 @@ define <8 x i64> @test_masked_2xi64_to_8
 ; CHECK-LABEL: test_masked_2xi64_to_8xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1115,7 +1115,7 @@ define <8 x i64> @test_masked_z_2xi64_to
 ; CHECK-LABEL: test_masked_z_2xi64_to_8xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec = load <2 x i64>, <2 x i64>* %vp
@@ -1136,7 +1136,7 @@ define <16 x i32> @test_masked_8xi32_to_
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12321, %ax # imm = 0x3021
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1149,7 +1149,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12321, %ax # imm = 0x3021
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1161,7 +1161,7 @@ define <16 x i32> @test_masked_8xi32_to_
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-39, %ax
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1174,7 +1174,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-39, %ax
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1186,7 +1186,7 @@ define <16 x i32> @test_masked_8xi32_to_
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24047, %ax # imm = 0xA211
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1199,7 +1199,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24047, %ax # imm = 0xA211
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1211,7 +1211,7 @@ define <16 x i32> @test_masked_8xi32_to_
 ; CHECK-LABEL: test_masked_8xi32_to_16xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5470, %ax # imm = 0x155E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1224,7 +1224,7 @@ define <16 x i32> @test_masked_z_8xi32_t
 ; CHECK-LABEL: test_masked_z_8xi32_to_16xi32_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5470, %ax # imm = 0x155E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %vp
@@ -1245,7 +1245,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1258,7 +1258,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1270,7 +1270,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1283,7 +1283,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1295,7 +1295,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $103, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1308,7 +1308,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $103, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1320,7 +1320,7 @@ define <8 x i64> @test_masked_4xi64_to_8
 ; CHECK-LABEL: test_masked_4xi64_to_8xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-83, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp
@@ -1333,7 +1333,7 @@ define <8 x i64> @test_masked_z_4xi64_to
 ; CHECK-LABEL: test_masked_z_4xi64_to_8xi64_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-83, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %vp

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-high.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 define <4 x float> @test_4xfloat_dup_high(<4 x float> %vec) {
 ; CHECK-LABEL: test_4xfloat_dup_high:
@@ -13,7 +13,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -26,7 +26,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -37,7 +37,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -50,7 +50,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -61,7 +61,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -74,7 +74,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -85,7 +85,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -98,7 +98,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -109,7 +109,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm1 {%k1} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -122,7 +122,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -142,7 +142,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -155,7 +155,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -167,7 +167,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -180,7 +180,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -192,7 +192,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -205,7 +205,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -217,7 +217,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -230,7 +230,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -242,7 +242,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -255,7 +255,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm0 {%k1} {z} = mem[1,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -275,7 +275,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -288,7 +288,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -299,7 +299,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -312,7 +312,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -323,7 +323,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -336,7 +336,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -347,7 +347,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -360,7 +360,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -371,7 +371,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-109, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -384,7 +384,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-109, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -404,7 +404,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -417,7 +417,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -429,7 +429,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -442,7 +442,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -454,7 +454,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $48, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -467,7 +467,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $48, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -479,7 +479,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -492,7 +492,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -504,7 +504,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -517,7 +517,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -537,7 +537,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -550,7 +550,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $31229, %ax # imm = 0x79FD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -561,7 +561,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -574,7 +574,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5887, %ax # imm = 0x16FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -585,7 +585,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15887, %ax # imm = 0xC1F1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -598,7 +598,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15887, %ax # imm = 0xC1F1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -609,7 +609,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8077, %ax # imm = 0xE073
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -622,7 +622,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8077, %ax # imm = 0xE073
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -633,7 +633,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5023, %ax # imm = 0xEC61
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -646,7 +646,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5023, %ax # imm = 0xEC61
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
@@ -666,7 +666,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10326, %ax # imm = 0xD7AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -679,7 +679,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10326, %ax # imm = 0xD7AA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -691,7 +691,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6675, %ax # imm = 0xE5ED
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -704,7 +704,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6675, %ax # imm = 0xE5ED
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -716,7 +716,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5042, %ax # imm = 0xEC4E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -729,7 +729,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5042, %ax # imm = 0xEC4E
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -741,7 +741,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30108, %ax # imm = 0x8A64
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -754,7 +754,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30108, %ax # imm = 0x8A64
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -766,7 +766,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25644, %ax # imm = 0x642C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -779,7 +779,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25644, %ax # imm = 0x642C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/duplicate-low.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) {
 ; CHECK-LABEL: test_2xdouble_dup_low:
@@ -13,7 +13,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -26,7 +26,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
@@ -37,7 +37,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -50,7 +50,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
@@ -70,7 +70,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -83,7 +83,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -95,7 +95,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -108,7 +108,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -128,7 +128,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -141,7 +141,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -152,7 +152,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -165,7 +165,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -176,7 +176,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -189,7 +189,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -200,7 +200,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -213,7 +213,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -224,7 +224,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -237,7 +237,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -257,7 +257,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -270,7 +270,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -282,7 +282,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -295,7 +295,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -307,7 +307,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -320,7 +320,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -332,7 +332,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -345,7 +345,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -357,7 +357,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -370,7 +370,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -390,7 +390,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -403,7 +403,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -414,7 +414,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -427,7 +427,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -438,7 +438,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -451,7 +451,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -462,7 +462,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -475,7 +475,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -486,7 +486,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -499,7 +499,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -519,7 +519,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -532,7 +532,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -544,7 +544,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $79, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -557,7 +557,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $79, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -569,7 +569,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -582,7 +582,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -594,7 +594,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -607,7 +607,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -619,7 +619,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -632,7 +632,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -652,7 +652,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -665,7 +665,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -676,7 +676,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -689,7 +689,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -700,7 +700,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -713,7 +713,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -724,7 +724,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -737,7 +737,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -748,7 +748,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -761,7 +761,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
@@ -781,7 +781,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -794,7 +794,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -806,7 +806,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -819,7 +819,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -831,7 +831,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -844,7 +844,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -856,7 +856,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -869,7 +869,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -881,7 +881,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -894,7 +894,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -914,7 +914,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -927,7 +927,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -938,7 +938,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -951,7 +951,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -962,7 +962,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -975,7 +975,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -986,7 +986,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -999,7 +999,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -1010,7 +1010,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1023,7 +1023,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
@@ -1043,7 +1043,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1056,7 +1056,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1068,7 +1068,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1081,7 +1081,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1093,7 +1093,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1106,7 +1106,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1118,7 +1118,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1131,7 +1131,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1143,7 +1143,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1156,7 +1156,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -1176,7 +1176,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21312, %ax # imm = 0x5340
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1189,7 +1189,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21312, %ax # imm = 0x5340
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -1200,7 +1200,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8490, %ax # imm = 0xDED6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1213,7 +1213,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8490, %ax # imm = 0xDED6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -1224,7 +1224,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12522, %ax # imm = 0x30EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1237,7 +1237,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12522, %ax # imm = 0x30EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -1248,7 +1248,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28344, %ax # imm = 0x9148
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1261,7 +1261,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28344, %ax # imm = 0x9148
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -1272,7 +1272,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15638, %ax # imm = 0x3D16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1285,7 +1285,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15638, %ax # imm = 0x3D16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
@@ -1305,7 +1305,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2129, %ax # imm = 0xF7AF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1318,7 +1318,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2129, %ax # imm = 0xF7AF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1330,7 +1330,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12900, %ax # imm = 0xCD9C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1343,7 +1343,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12900, %ax # imm = 0xCD9C
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1355,7 +1355,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29358, %ax # imm = 0x72AE
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1368,7 +1368,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29358, %ax # imm = 0x72AE
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1380,7 +1380,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5272, %ax # imm = 0x1498
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1393,7 +1393,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $5272, %ax # imm = 0x1498
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1405,7 +1405,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20975, %ax # imm = 0x51EF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1418,7 +1418,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20975, %ax # imm = 0x51EF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 ; FIXME: The non immediate <16 x float> test cases should be fixed by PR34382
 
@@ -15,7 +15,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 {%k1} = xmm0[2,1,3,1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -28,7 +28,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 1>
@@ -39,7 +39,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -52,7 +52,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
@@ -63,7 +63,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -76,7 +76,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 1>
@@ -95,7 +95,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm1 {%k1} = xmm0[1,2,3,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -108,7 +108,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 2>
@@ -128,7 +128,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = mem[3,3,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -141,7 +141,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[3,3,1,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -154,7 +154,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = mem[1,3,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -167,7 +167,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[1,3,2,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -180,7 +180,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = mem[2,1,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -193,7 +193,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -215,7 +215,7 @@ define <4 x float> @test_masked_4xfloat_
 ; CHECK-LABEL: test_masked_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} = mem[0,1,3,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -228,7 +228,7 @@ define <4 x float> @test_masked_z_4xfloa
 ; CHECK-LABEL: test_masked_z_4xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,0]
 ; CHECK-NEXT:    retq
   %vec = load <4 x float>, <4 x float>* %vp
@@ -249,7 +249,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $83, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,6,6,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -262,7 +262,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $83, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,6,6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 6, i32 6>
@@ -273,7 +273,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,7,6,7,6]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -286,7 +286,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-34, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,7,6,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 7, i32 6, i32 7, i32 6>
@@ -297,7 +297,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,1,6,5,4,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -310,7 +310,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,1,6,5,4,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 2, i32 1, i32 6, i32 5, i32 4, i32 4>
@@ -329,7 +329,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,2,1,0,6,6,5,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -342,7 +342,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-111, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,1,0,6,6,5,4]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 2, i32 1, i32 0, i32 6, i32 6, i32 5, i32 4>
@@ -353,7 +353,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $61, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,3,7,7,6,5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -366,7 +366,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $61, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,3,7,7,6,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 6, i32 5>
@@ -377,7 +377,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3,6,5,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -390,7 +390,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3,6,5,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 3, i32 6, i32 5, i32 7, i32 7>
@@ -409,7 +409,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,5,6,7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -422,7 +422,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,5,6,7,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 2, i32 5, i32 6, i32 7, i32 7>
@@ -433,7 +433,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm1 {%k1} = ymm0[3,0,2,1,7,4,6,5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -446,7 +446,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,2,1,7,4,6,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 0, i32 2, i32 1, i32 7, i32 4, i32 6, i32 5>
@@ -468,7 +468,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,2,4,6,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -482,7 +482,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,2,4,6,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -495,7 +495,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = mem[2,0,2,2,6,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -508,7 +508,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[2,0,2,2,6,4,6,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -522,7 +522,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = ymm1[2,1,1,3,4,4,7,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -536,7 +536,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,1,3,4,4,7,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -558,7 +558,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = mem[0,0,3,3,4,4,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -571,7 +571,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[0,0,3,3,4,4,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -585,7 +585,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = ymm1[0,1,0,1,4,6,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -599,7 +599,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,0,1,4,6,5,4]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -612,7 +612,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = mem[2,0,0,3,6,4,4,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -625,7 +625,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[2,0,0,3,6,4,4,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -649,7 +649,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    movb $-54, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = ymm1[0,1,2,3,7,4,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -663,7 +663,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    movb $-54, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -676,7 +676,7 @@ define <8 x float> @test_masked_8xfloat_
 ; CHECK-LABEL: test_masked_8xfloat_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $85, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} = mem[0,2,3,1,4,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -689,7 +689,7 @@ define <8 x float> @test_masked_z_8xfloa
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $85, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,1,4,6,7,5]
 ; CHECK-NEXT:    retq
   %vec = load <8 x float>, <8 x float>* %vp
@@ -712,7 +712,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [1,1,3,1,6,4,6,5,8,9,8,11,13,13,13,15]
 ; CHECK-NEXT:    movw $16429, %ax # imm = 0x402D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -726,7 +726,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [1,1,3,1,6,4,6,5,8,9,8,11,13,13,13,15]
 ; CHECK-NEXT:    movw $16429, %ax # imm = 0x402D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 1, i32 6, i32 4, i32 6, i32 5, i32 8, i32 9, i32 8, i32 11, i32 13, i32 13, i32 13, i32 15>
@@ -737,7 +737,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26425, %ax # imm = 0x98C7
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -750,7 +750,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26425, %ax # imm = 0x98C7
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,2,2,1,6,6,6,5,10,10,10,9,14,14,14,13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 2, i32 2, i32 1, i32 6, i32 6, i32 6, i32 5, i32 10, i32 10, i32 10, i32 9, i32 14, i32 14, i32 14, i32 13>
@@ -762,7 +762,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [1,2,0,0,5,4,6,5,11,10,9,9,14,13,14,12]
 ; CHECK-NEXT:    movw $28987, %ax # imm = 0x713B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -776,7 +776,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [1,2,0,0,5,4,6,5,11,10,9,9,14,13,14,12]
 ; CHECK-NEXT:    movw $28987, %ax # imm = 0x713B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 0, i32 0, i32 5, i32 4, i32 6, i32 5, i32 11, i32 10, i32 9, i32 9, i32 14, i32 13, i32 14, i32 12>
@@ -795,7 +795,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $11457, %ax # imm = 0x2CC1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -808,7 +808,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $11457, %ax # imm = 0x2CC1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 0, i32 2, i32 5, i32 5, i32 4, i32 6, i32 9, i32 9, i32 8, i32 10, i32 13, i32 13, i32 12, i32 14>
@@ -820,7 +820,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [1,2,3,3,5,5,5,7,11,11,8,11,14,12,14,15]
 ; CHECK-NEXT:    movw $30908, %ax # imm = 0x78BC
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -834,7 +834,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [1,2,3,3,5,5,5,7,11,11,8,11,14,12,14,15]
 ; CHECK-NEXT:    movw $30908, %ax # imm = 0x78BC
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 3, i32 3, i32 5, i32 5, i32 5, i32 7, i32 11, i32 11, i32 8, i32 11, i32 14, i32 12, i32 14, i32 15>
@@ -845,7 +845,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26863, %ax # imm = 0x68EF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -858,7 +858,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26863, %ax # imm = 0x68EF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2,1,0,5,6,5,4,9,10,9,8,13,14,13,12]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 1, i32 2, i32 1, i32 0, i32 5, i32 6, i32 5, i32 4, i32 9, i32 10, i32 9, i32 8, i32 13, i32 14, i32 13, i32 12>
@@ -879,7 +879,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [2,0,3,2,4,4,6,7,9,11,8,11,13,12,13,13]
 ; CHECK-NEXT:    movw $-28239, %ax # imm = 0x91B1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -893,7 +893,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,0,3,2,4,4,6,7,9,11,8,11,13,12,13,13]
 ; CHECK-NEXT:    movw $-28239, %ax # imm = 0x91B1
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 2, i32 4, i32 4, i32 6, i32 7, i32 9, i32 11, i32 8, i32 11, i32 13, i32 12, i32 13, i32 13>
@@ -904,7 +904,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-32205, %ax # imm = 0x8233
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -917,7 +917,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-32205, %ax # imm = 0x8233
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,0,2,7,7,4,6,11,11,8,10,15,15,12,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 3, i32 3, i32 0, i32 2, i32 7, i32 7, i32 4, i32 6, i32 11, i32 11, i32 8, i32 10, i32 15, i32 15, i32 12, i32 14>
@@ -939,7 +939,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12]
 ; CHECK-NEXT:    movw $-22887, %ax # imm = 0xA699
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -953,7 +953,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [3,3,3,0,6,6,6,6,11,10,9,10,12,14,12,12]
 ; CHECK-NEXT:    movw $-22887, %ax # imm = 0xA699
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -966,7 +966,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $22744, %ax # imm = 0x58D8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[1,3,2,1,5,7,6,5,9,11,10,9,13,15,14,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -979,7 +979,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $22744, %ax # imm = 0x58D8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,3,2,1,5,7,6,5,9,11,10,9,13,15,14,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -993,7 +993,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13]
 ; CHECK-NEXT:    movw $-8399, %ax # imm = 0xDF31
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1007,7 +1007,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,0,0,3,5,5,6,5,9,8,8,8,14,12,13,13]
 ; CHECK-NEXT:    movw $-8399, %ax # imm = 0xDF31
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1029,7 +1029,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18246, %ax # imm = 0x4746
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1042,7 +1042,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18246, %ax # imm = 0x4746
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,0,3,1,5,4,7,5,9,8,11,9,13,12,15,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1056,7 +1056,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12]
 ; CHECK-NEXT:    movw $1218, %ax # imm = 0x4C2
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1070,7 +1070,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [3,3,1,1,6,5,5,6,11,11,10,9,15,14,12,12]
 ; CHECK-NEXT:    movw $1218, %ax # imm = 0x4C2
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1083,7 +1083,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2665, %ax # imm = 0xA69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[2,0,0,1,6,4,4,5,10,8,8,9,14,12,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1096,7 +1096,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2665, %ax # imm = 0xA69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[2,0,0,1,6,4,4,5,10,8,8,9,14,12,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1120,7 +1120,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15]
 ; CHECK-NEXT:    movw $-20907, %ax # imm = 0xAE55
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1134,7 +1134,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,1,1,2,6,5,5,7,9,11,9,9,12,15,14,15]
 ; CHECK-NEXT:    movw $-20907, %ax # imm = 0xAE55
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1147,7 +1147,7 @@ define <16 x float> @test_masked_16xfloa
 ; CHECK-LABEL: test_masked_16xfloat_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28944, %ax # imm = 0x8EF0
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} = mem[1,2,0,1,5,6,4,5,9,10,8,9,13,14,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1160,7 +1160,7 @@ define <16 x float> @test_masked_z_16xfl
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28944, %ax # imm = 0x8EF0
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[1,2,0,1,5,6,4,5,9,10,8,9,13,14,12,13]
 ; CHECK-NEXT:    retq
   %vec = load <16 x float>, <16 x float>* %vp
@@ -1181,7 +1181,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1194,7 +1194,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -1205,7 +1205,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm1 {%k1} = xmm0[1,0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1218,7 +1218,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 1, i32 0>
@@ -1238,7 +1238,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} = mem[1,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -1251,7 +1251,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} {z} = mem[1,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -1264,7 +1264,7 @@ define <2 x double> @test_masked_2xdoubl
 ; CHECK-LABEL: test_masked_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} = mem[1,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -1277,7 +1277,7 @@ define <2 x double> @test_masked_z_2xdou
 ; CHECK-LABEL: test_masked_z_2xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 {%k1} {z} = mem[1,0]
 ; CHECK-NEXT:    retq
   %vec = load <2 x double>, <2 x double>* %vp
@@ -1298,7 +1298,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,0,2,3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1311,7 +1311,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
@@ -1322,7 +1322,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1335,7 +1335,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
@@ -1346,7 +1346,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,3,3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1359,7 +1359,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
@@ -1378,7 +1378,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm1 {%k1} = ymm0[1,1,2,2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1391,7 +1391,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,2,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
@@ -1411,7 +1411,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} = mem[0,1,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1424,7 +1424,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1437,7 +1437,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} = mem[0,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1450,7 +1450,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[0,1,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1463,7 +1463,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} = mem[1,0,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1476,7 +1476,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,3,3]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1498,7 +1498,7 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} = mem[1,0,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1511,7 +1511,7 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,3,2]
 ; CHECK-NEXT:    retq
   %vec = load <4 x double>, <4 x double>* %vp
@@ -1532,7 +1532,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,3,2,4,5,7,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1545,7 +1545,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,3,2,4,5,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 2, i32 4, i32 5, i32 7, i32 6>
@@ -1556,7 +1556,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,7,6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1569,7 +1569,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 7, i32 6>
@@ -1580,7 +1580,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-53, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,3,5,5,6,7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1593,7 +1593,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-53, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,3,5,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 3, i32 5, i32 5, i32 6, i32 7>
@@ -1612,7 +1612,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,2,4,4,6,7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1625,7 +1625,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 7>
@@ -1645,7 +1645,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,5,4,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1658,7 +1658,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,5,4,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1671,7 +1671,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = mem[0,1,3,3,4,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1684,7 +1684,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[0,1,3,3,4,5,7,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1697,7 +1697,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,5,4,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1710,7 +1710,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,5,4,7,6]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1732,7 +1732,7 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} = mem[1,0,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp
@@ -1745,7 +1745,7 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $89, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,3,2,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec = load <8 x double>, <8 x double>* %vp

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
 
 ; FIXME: All cases here should be fixed by PR34380
 
@@ -4146,10 +4146,10 @@ define <2 x double> @test_masked_8xdoubl
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm2
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>
@@ -4162,9 +4162,10 @@ define <2 x double> @test_masked_z_8xdou
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovapd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/permute.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
 
 define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
 ; CHECK-LABEL: test_16xi16_perm_mask0:

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-interleave.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 define <4 x float> @test_4xfloat_shuff_mask0(<4 x float> %vec1, <4 x float> %vec2) {
 ; CHECK-LABEL: test_4xfloat_shuff_mask0:
@@ -13,7 +13,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[2,1],xmm1[3,1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -26,7 +26,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],xmm1[3,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 1, i32 7, i32 5>
@@ -37,7 +37,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,2],xmm1[3,2]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -50,7 +50,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2],xmm1[3,2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 2, i32 7, i32 6>
@@ -61,7 +61,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[1,3],xmm1[2,1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -74,7 +74,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],xmm1[2,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 1, i32 3, i32 6, i32 5>
@@ -93,7 +93,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm2 {%k1} = xmm0[3,3],xmm1[3,3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -106,7 +106,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],xmm1[3,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 3, i32 3, i32 7, i32 7>
@@ -126,7 +126,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,0],mem[1,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -140,7 +140,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0],mem[1,2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -153,7 +153,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[3,3],mem[1,3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -167,7 +167,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3],mem[1,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -180,7 +180,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[1,3],mem[2,0]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -194,7 +194,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3],mem[2,0]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -216,7 +216,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm1 {%k1} = xmm0[2,1],mem[3,2]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -230,7 +230,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1],mem[3,2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -251,7 +251,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -264,7 +264,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3],ymm1[0,2],ymm0[5,7],ymm1[4,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 1, i32 3, i32 8, i32 10, i32 5, i32 7, i32 12, i32 14>
@@ -275,7 +275,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -288,7 +288,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3],ymm1[3,1],ymm0[4,7],ymm1[7,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 3, i32 11, i32 9, i32 4, i32 7, i32 15, i32 13>
@@ -299,7 +299,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -312,7 +312,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2],ymm1[2,2],ymm0[4,6],ymm1[6,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 2, i32 10, i32 10, i32 4, i32 6, i32 14, i32 14>
@@ -331,7 +331,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm2 {%k1} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -344,7 +344,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2],ymm1[3,2],ymm0[7,6],ymm1[7,6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 3, i32 2, i32 11, i32 10, i32 7, i32 6, i32 15, i32 14>
@@ -364,7 +364,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -378,7 +378,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-106, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1],mem[0,0],ymm0[6,5],mem[4,4]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -391,7 +391,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -405,7 +405,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $114, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2],mem[1,0],ymm0[6,6],mem[5,4]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -418,7 +418,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -432,7 +432,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -454,7 +454,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm1 {%k1} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -468,7 +468,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3],mem[2,1],ymm0[7,7],mem[6,5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -489,7 +489,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19315, %ax # imm = 0xB48D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -502,7 +502,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19315, %ax # imm = 0xB48D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2],zmm1[3,2],zmm0[7,6],zmm1[7,6],zmm0[11,10],zmm1[11,10],zmm0[15,14],zmm1[15,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 2, i32 19, i32 18, i32 7, i32 6, i32 23, i32 22, i32 11, i32 10, i32 27, i32 26, i32 15, i32 14, i32 31, i32 30>
@@ -513,7 +513,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18064, %ax # imm = 0x4690
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -526,7 +526,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18064, %ax # imm = 0x4690
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[1,2],zmm1[3,3],zmm0[5,6],zmm1[7,7],zmm0[9,10],zmm1[11,11],zmm0[13,14],zmm1[15,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 1, i32 2, i32 19, i32 19, i32 5, i32 6, i32 23, i32 23, i32 9, i32 10, i32 27, i32 27, i32 13, i32 14, i32 31, i32 31>
@@ -537,7 +537,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12346, %ax # imm = 0xCFC6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -550,7 +550,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12346, %ax # imm = 0xCFC6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],zmm1[2,1],zmm0[7,4],zmm1[6,5],zmm0[11,8],zmm1[10,9],zmm0[15,12],zmm1[14,13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 3, i32 0, i32 18, i32 17, i32 7, i32 4, i32 22, i32 21, i32 11, i32 8, i32 26, i32 25, i32 15, i32 12, i32 30, i32 29>
@@ -569,7 +569,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-9865, %ax # imm = 0xD977
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -582,7 +582,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-9865, %ax # imm = 0xD977
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3],zmm1[0,2],zmm0[6,7],zmm1[4,6],zmm0[10,11],zmm1[8,10],zmm0[14,15],zmm1[12,14]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 3, i32 16, i32 18, i32 6, i32 7, i32 20, i32 22, i32 10, i32 11, i32 24, i32 26, i32 14, i32 15, i32 28, i32 30>
@@ -602,7 +602,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $7677, %ax # imm = 0x1DFD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -616,7 +616,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $7677, %ax # imm = 0x1DFD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0],mem[0,2],zmm0[7,4],mem[4,6],zmm0[11,8],mem[8,10],zmm0[15,12],mem[12,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -629,7 +629,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14448, %ax # imm = 0x3870
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -643,7 +643,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14448, %ax # imm = 0x3870
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[0,2],mem[3,2],zmm0[4,6],mem[7,6],zmm0[8,10],mem[11,10],zmm0[12,14],mem[15,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -656,7 +656,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-13463, %ax # imm = 0xCB69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -670,7 +670,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-13463, %ax # imm = 0xCB69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0],mem[2,2],zmm0[6,4],mem[6,6],zmm0[10,8],mem[10,10],zmm0[14,12],mem[14,14]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -692,7 +692,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21793, %ax # imm = 0x5521
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm1 {%k1} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -706,7 +706,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $21793, %ax # imm = 0x5521
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1],mem[1,3],zmm0[6,5],mem[5,7],zmm0[10,9],mem[9,11],zmm0[14,13],mem[13,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -727,7 +727,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -740,7 +740,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -751,7 +751,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -764,7 +764,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 2>
@@ -784,7 +784,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -798,7 +798,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -811,7 +811,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -825,7 +825,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -846,7 +846,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -859,7 +859,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 7>
@@ -870,7 +870,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -883,7 +883,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 3, i32 6>
@@ -894,7 +894,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -907,7 +907,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[0],ymm0[3],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 4, i32 3, i32 6>
@@ -926,7 +926,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -939,7 +939,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 7>
@@ -959,7 +959,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -973,7 +973,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -986,7 +986,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1000,7 +1000,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1013,7 +1013,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[3],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1027,7 +1027,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[3],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1049,7 +1049,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1063,7 +1063,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1084,7 +1084,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1097,7 +1097,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-77, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[4],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 4, i32 13, i32 7, i32 15>
@@ -1108,7 +1108,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1121,7 +1121,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[3],zmm0[5],zmm1[5],zmm0[6],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 11, i32 5, i32 13, i32 6, i32 15>
@@ -1132,7 +1132,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1145,7 +1145,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-87, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[5],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 13, i32 6, i32 14>
@@ -1164,7 +1164,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1177,7 +1177,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[0],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 8, i32 3, i32 11, i32 4, i32 12, i32 7, i32 15>
@@ -1197,7 +1197,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1211,7 +1211,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[5],mem[5],zmm0[6],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1224,7 +1224,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1238,7 +1238,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[0],zmm0[3],mem[2],zmm0[4],mem[4],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1251,7 +1251,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1265,7 +1265,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[2],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1287,7 +1287,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1301,7 +1301,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-39, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[2],mem[3],zmm0[4],mem[5],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle-vec.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 ; FIXME: 128-bit shuffles of 256-bit vectors cases should be fixed by PR34359
 
@@ -16,7 +16,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -29,7 +29,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-41, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -41,7 +41,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -54,7 +54,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-63, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -66,7 +66,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -79,7 +79,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -99,7 +99,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -112,7 +112,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $66, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -133,7 +133,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -147,7 +147,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -161,7 +161,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -175,7 +175,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -189,7 +189,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -203,7 +203,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-50, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -226,7 +226,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -240,7 +240,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -261,7 +261,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -274,7 +274,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -285,7 +285,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -298,7 +298,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
@@ -309,7 +309,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $75, %ax
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -322,7 +322,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $75, %ax
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
@@ -341,7 +341,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -354,7 +354,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
@@ -374,7 +374,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -388,7 +388,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -401,7 +401,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -415,7 +415,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -428,7 +428,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -442,7 +442,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -464,7 +464,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -478,7 +478,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -500,7 +500,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -513,7 +513,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -525,7 +525,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -538,7 +538,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -550,7 +550,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -563,7 +563,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -583,7 +583,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -596,7 +596,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -617,7 +617,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -631,7 +631,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -645,7 +645,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -659,7 +659,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -673,7 +673,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -687,7 +687,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -710,7 +710,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -724,7 +724,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -745,7 +745,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -758,7 +758,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
@@ -769,7 +769,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -782,7 +782,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
@@ -793,7 +793,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -806,7 +806,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $30, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
@@ -825,7 +825,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -838,7 +838,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
@@ -858,7 +858,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -872,7 +872,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $95, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -885,7 +885,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -899,7 +899,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -912,7 +912,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -926,7 +926,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -948,7 +948,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -962,7 +962,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -984,7 +984,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -997,7 +997,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $26, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1009,7 +1009,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1022,7 +1022,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $-4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1034,7 +1034,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1047,7 +1047,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $51, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
@@ -1067,7 +1067,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1080,7 +1080,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $92, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
@@ -1101,7 +1101,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1115,7 +1115,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1129,7 +1129,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1143,7 +1143,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $-104, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1157,7 +1157,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $113, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1171,7 +1171,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $113, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1194,7 +1194,7 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $45, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1208,7 +1208,7 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $45, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
@@ -1229,7 +1229,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1242,7 +1242,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
@@ -1253,7 +1253,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1266,7 +1266,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1277,7 +1277,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1290,7 +1290,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
@@ -1309,7 +1309,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1322,7 +1322,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
@@ -1342,7 +1342,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1356,7 +1356,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1369,7 +1369,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1383,7 +1383,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1396,7 +1396,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1410,7 +1410,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1432,7 +1432,7 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1446,7 +1446,7 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
@@ -1468,7 +1468,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1481,7 +1481,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1493,7 +1493,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1506,7 +1506,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1518,7 +1518,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1531,7 +1531,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -1551,7 +1551,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1564,7 +1564,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -1585,7 +1585,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1599,7 +1599,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1613,7 +1613,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1627,7 +1627,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1641,7 +1641,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1655,7 +1655,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1]
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1678,7 +1678,7 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1692,7 +1692,7 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3]
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
@@ -1713,7 +1713,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1726,7 +1726,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
@@ -1737,7 +1737,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-17, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1750,7 +1750,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-17, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
@@ -1761,7 +1761,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1774,7 +1774,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
@@ -1793,7 +1793,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1806,7 +1806,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
@@ -1826,7 +1826,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1840,7 +1840,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1853,7 +1853,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1867,7 +1867,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1880,7 +1880,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1894,7 +1894,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $42, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
@@ -1916,7 +1916,7 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1930,7 +1930,7 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/shuffle.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s
 
 define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
 ; CHECK-LABEL: test_16xi8_perm_mask0:

Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/unpack.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/unpack.ll?rev=312474&r1=312473&r2=312474&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/unpack.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/unpack.ll Mon Sep  4 02:31:32 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
 
 define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
 ; CHECK-LABEL: test_4xfloat_unpack_low_mask0:
@@ -13,7 +13,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -26,7 +26,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -37,7 +37,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -50,7 +50,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -61,7 +61,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -74,7 +74,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -93,7 +93,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -106,7 +106,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -126,7 +126,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -140,7 +140,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -153,7 +153,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -167,7 +167,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -180,7 +180,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -194,7 +194,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -216,7 +216,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -230,7 +230,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -251,7 +251,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $122, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -264,7 +264,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $122, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -275,7 +275,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -288,7 +288,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -299,7 +299,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -312,7 +312,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -331,7 +331,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-127, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -344,7 +344,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-127, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
@@ -364,7 +364,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -378,7 +378,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -391,7 +391,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -405,7 +405,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -418,7 +418,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -432,7 +432,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -454,7 +454,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -468,7 +468,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -489,7 +489,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -502,7 +502,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -513,7 +513,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -526,7 +526,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -537,7 +537,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -550,7 +550,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -569,7 +569,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -582,7 +582,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
@@ -602,7 +602,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -616,7 +616,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -629,7 +629,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -643,7 +643,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -656,7 +656,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -670,7 +670,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -692,7 +692,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -706,7 +706,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -727,7 +727,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -740,7 +740,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -751,7 +751,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -764,7 +764,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
@@ -784,7 +784,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -798,7 +798,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -811,7 +811,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -825,7 +825,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -846,7 +846,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -859,7 +859,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -870,7 +870,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -883,7 +883,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -894,7 +894,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -907,7 +907,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -926,7 +926,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -939,7 +939,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -959,7 +959,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -973,7 +973,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -986,7 +986,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1000,7 +1000,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1013,7 +1013,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1027,7 +1027,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1049,7 +1049,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1063,7 +1063,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -1084,7 +1084,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1097,7 +1097,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1108,7 +1108,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1121,7 +1121,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1132,7 +1132,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1145,7 +1145,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1164,7 +1164,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1177,7 +1177,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -1197,7 +1197,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1211,7 +1211,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1224,7 +1224,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1238,7 +1238,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1251,7 +1251,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1265,7 +1265,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1287,7 +1287,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1301,7 +1301,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -1322,7 +1322,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -1335,7 +1335,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1346,7 +1346,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -1359,7 +1359,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1370,7 +1370,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -1383,7 +1383,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1402,7 +1402,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -1415,7 +1415,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -1435,7 +1435,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1449,7 +1449,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -1462,7 +1462,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1476,7 +1476,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -1489,7 +1489,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1503,7 +1503,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -1525,7 +1525,7 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -1539,7 +1539,7 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x float>, <4 x float>* %vec2p
@@ -1560,7 +1560,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1573,7 +1573,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -1584,7 +1584,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1597,7 +1597,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -1608,7 +1608,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1621,7 +1621,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -1640,7 +1640,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -1653,7 +1653,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
@@ -1673,7 +1673,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1687,7 +1687,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -1700,7 +1700,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1714,7 +1714,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -1727,7 +1727,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1741,7 +1741,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -1763,7 +1763,7 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -1777,7 +1777,7 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x float>, <8 x float>* %vec2p
@@ -1798,7 +1798,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1811,7 +1811,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -1822,7 +1822,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1835,7 +1835,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -1846,7 +1846,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1859,7 +1859,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -1878,7 +1878,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1891,7 +1891,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
@@ -1911,7 +1911,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1925,7 +1925,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -1938,7 +1938,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1952,7 +1952,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -1965,7 +1965,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1979,7 +1979,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -2001,7 +2001,7 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2015,7 +2015,7 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
 ; CHECK-NEXT:    retq
   %vec2 = load <16 x float>, <16 x float>* %vec2p
@@ -2036,7 +2036,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -2049,7 +2049,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2060,7 +2060,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -2073,7 +2073,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
@@ -2093,7 +2093,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2107,7 +2107,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -2120,7 +2120,7 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -2134,7 +2134,7 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
 ; CHECK-NEXT:    retq
   %vec2 = load <2 x double>, <2 x double>* %vec2p
@@ -2155,7 +2155,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2168,7 +2168,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2179,7 +2179,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2192,7 +2192,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2203,7 +2203,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2216,7 +2216,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2235,7 +2235,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -2248,7 +2248,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
@@ -2268,7 +2268,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2282,7 +2282,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -2295,7 +2295,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2309,7 +2309,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -2322,7 +2322,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2336,7 +2336,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -2358,7 +2358,7 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -2372,7 +2372,7 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
 ; CHECK-NEXT:    retq
   %vec2 = load <4 x double>, <4 x double>* %vec2p
@@ -2393,7 +2393,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2406,7 +2406,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2417,7 +2417,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2430,7 +2430,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-21, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2441,7 +2441,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-118, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2454,7 +2454,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-118, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2473,7 +2473,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2486,7 +2486,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $100, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -2506,7 +2506,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2520,7 +2520,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -2533,7 +2533,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2547,7 +2547,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $71, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -2560,7 +2560,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2574,7 +2574,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p
@@ -2596,7 +2596,7 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-40, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2610,7 +2610,7 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-40, %al
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
 ; CHECK-NEXT:    retq
   %vec2 = load <8 x double>, <8 x double>* %vec2p




More information about the llvm-commits mailing list