[llvm] r258045 - AVX512 : Change v8i1 bitconvert GR8 pattern, remove unnecessary movzbl instruction.

Igor Breger via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 18 04:02:48 PST 2016


Author: ibreger
Date: Mon Jan 18 06:02:45 2016
New Revision: 258045

URL: http://llvm.org/viewvc/llvm-project?rev=258045&view=rev
Log:
AVX512 : Change v8i1 bitconvert GR8 pattern, remove unnecessary movzbl instruction.
Code example, previous implementation:
    movzbl  %dil, %eax
    kmovw  %eax, %k0
  New code:
    kmovw  %edi, %k0

Differential Revision: http://reviews.llvm.org/D16287

Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
    llvm/trunk/test/CodeGen/X86/avx512-ext.ll
    llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-select.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512cdvl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
    llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Jan 18 06:02:45 2016
@@ -2172,7 +2172,7 @@ let Predicates = [HasAVX512, NoDQI] in {
   // GR from/to 8-bit mask without native support
   def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
             (COPY_TO_REGCLASS
-             (KMOVWkr (MOVZX32rr8 GR8 :$src)), VK8)>;
+             (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)), VK8)>;
   def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
             (EXTRACT_SUBREG
               (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),

Modified: llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-calling-conv.ll Mon Jan 18 06:02:45 2016
@@ -277,7 +277,6 @@ define <8 x i1> @test7a(<8 x i32>%a, <8
 ; KNL-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    movb $85, %al
-; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k1 {%k1}
 ; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
@@ -313,7 +312,6 @@ define <8 x i1> @test7a(<8 x i32>%a, <8
 ; KNL_X32-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; KNL_X32-NEXT:    vpsllvq LCPI7_0, %zmm0, %zmm0
 ; KNL_X32-NEXT:    movb $85, %al
-; KNL_X32-NEXT:    movzbl %al, %eax
 ; KNL_X32-NEXT:    kmovw %eax, %k1
 ; KNL_X32-NEXT:    vptestmq %zmm0, %zmm0, %k1 {%k1}
 ; KNL_X32-NEXT:    vpbroadcastd LCPI7_1, %zmm0

Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Mon Jan 18 06:02:45 2016
@@ -1312,8 +1312,7 @@ define   <16 x i32> @zext_16i1_to_16xi32
 define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 ; KNL-LABEL: zext_8i1_to_8xi64:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movzbl %dil, %eax
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kmovw %edi, %k1
 ; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; KNL-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-fma-intrinsics.ll Mon Jan 18 06:02:45 2016
@@ -1,78 +1,104 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
 
 declare <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
 declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
 
 define <16 x float> @test_x86_vfnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmadd_ps_z
-  ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_x86_vfnmadd_ps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <16 x float> @test_mask_vfnmadd_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmadd_ps
-  ; CHECK: vfnmadd213ps %zmm
+; CHECK-LABEL: test_mask_vfnmadd_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <8 x double> @test_x86_vfnmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmadd_pd_z
-  ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_x86_vfnmadd_pd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfnmadd_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmadd_pd
-  ; CHECK: vfnmadd213pd %zmm
+; CHECK-LABEL: test_mask_vfnmadd_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <16 x float> @test_x86_vfnmsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmsubps_z
-  ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_x86_vfnmsubps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <16 x float> @test_mask_vfnmsub_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmsub_ps
-  ; CHECK: vfnmsub213ps %zmm
+; CHECK-LABEL: test_mask_vfnmsub_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <8 x double> @test_x86_vfnmsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfnmsubpd_z
-  ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_x86_vfnmsubpd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfnmsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfnmsub_pd
-  ; CHECK: vfnmsub213pd %zmm
+; CHECK-LABEL: test_mask_vfnmsub_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <16 x float> @test_x86_vfmaddsubps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_x86_vfmaddsubps_z
-  ; CHECK: vfmaddsub213ps %zmm
+; CHECK-LABEL: test_x86_vfmaddsubps_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_fmaddsub_ps(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; CHECK-LABEL: test_mask_fmaddsub_ps:
-; CHECK: vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa6,0xc2]
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmaddsub213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 4)
   ret <16 x float> %res
 }
@@ -80,16 +106,21 @@ define <16 x float> @test_mask_fmaddsub_
 declare <16 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_x86_vfmaddsubpd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_x86_vfmaddsubpd_z
-  ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_x86_vfmaddsubpd_z:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <8 x double> @test_mask_vfmaddsub_pd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_vfmaddsub_pd
-  ; CHECK: vfmaddsub213pd %zmm
+; CHECK-LABEL: test_mask_vfmaddsub_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -97,8 +128,7 @@ define <8 x double> @test_mask_vfmaddsub
 define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -115,8 +145,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -133,8 +162,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -200,8 +228,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -231,71 +258,96 @@ define <16 x float>@test_int_x86_avx512_
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne
-  ; CHECK: vfmadd213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 0) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn
-  ; CHECK: vfmadd213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 1) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp
-  ; CHECK: vfmadd213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 2) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz
-  ; CHECK: vfmadd213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 3) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrb_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current
-  ; CHECK: vfmadd213ps  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rne(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne
-  ; CHECK: vfmadd213ps  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 0) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtn(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn
-  ; CHECK: vfmadd213ps  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 1) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtp(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp
-  ; CHECK: vfmadd213ps  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 2) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_rtz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz
-  ; CHECK: vfmadd213ps  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 3) nounwind
   ret <16 x float> %res
 }
 
 define <16 x float> @test_mask_round_vfmadd512_ps_rrbz_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current
-  ; CHECK: vfmadd213ps  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_ps_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vfmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4) nounwind
   ret <16 x float> %res
 }
@@ -305,8 +357,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -336,71 +387,96 @@ define <16 x float>@test_int_x86_avx512_
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne
-  ; CHECK: vfmadd213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn
-  ; CHECK: vfmadd213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp
-  ; CHECK: vfmadd213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz
-  ; CHECK: vfmadd213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current
-  ; CHECK: vfmadd213pd  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne
-  ; CHECK: vfmadd213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn
-  ; CHECK: vfmadd213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp
-  ; CHECK: vfmadd213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz
-  ; CHECK: vfmadd213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfmadd512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current
-  ; CHECK: vfmadd213pd  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; CHECK-LABEL: test_mask_round_vfmadd512_pd_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -408,8 +484,7 @@ define <8 x double> @test_mask_round_vfm
 define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -426,8 +501,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -444,8 +518,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -508,71 +581,96 @@ define <16 x float>@test_int_x86_avx512_
 
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne
-  ; CHECK: vfnmsub213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x19,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn
-  ; CHECK: vfnmsub213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x39,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp
-  ; CHECK: vfnmsub213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x59,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz
-  ; CHECK: vfnmsub213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x79,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrb_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current
-  ; CHECK: vfnmsub213pd  %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrb_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rne(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne
-  ; CHECK: vfnmsub213pd  {rn-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x18,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rne:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 0) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtn(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn
-  ; CHECK: vfnmsub213pd  {rd-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x38,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rd-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 1) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtp(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp
-  ; CHECK: vfnmsub213pd  {ru-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x58,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtp:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {ru-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 2) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_rtz(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz
-  ; CHECK: vfnmsub213pd  {rz-sae}, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x78,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_rtz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd {rz-sae}, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 3) nounwind
   ret <8 x double> %res
 }
 
 define <8 x double> @test_mask_round_vfnmsub512_pd_rrbz_current(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
-  ; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current
-  ; CHECK: vfnmsub213pd  %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xae,0xc2]
+; CHECK-LABEL: test_mask_round_vfnmsub512_pd_rrbz_current:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vfnmsub.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 -1, i32 4) nounwind
   ret <8 x double> %res
 }
@@ -580,8 +678,7 @@ define <8 x double> @test_mask_round_vfn
 define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -598,8 +695,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0
@@ -646,8 +742,7 @@ define <16 x float>@test_int_x86_avx512_
 define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Mon Jan 18 06:02:45 2016
@@ -516,15 +516,18 @@ declare <8 x double> @llvm.x86.avx512.vb
 
 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK: kmovw   %edi, %k1
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastss %xmm0, %zmm0
-; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0
-
-  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1) 
-  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) 
-  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) 
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+
+  %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
+  %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask)
   %res3 = fadd <16 x float> %res, %res1
   %res4 = fadd <16 x float> %res2, %res3
   ret <16 x float> %res4
@@ -534,15 +537,18 @@ declare <16 x float> @llvm.x86.avx512.ma
 
 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK: kmovw   %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0
-; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0
-
-  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1) 
-  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) 
-  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) 
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
+
+  %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
+  %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask)
   %res3 = fadd <8 x double> %res, %res1
   %res4 = fadd <8 x double> %res2, %res3
   ret <8 x double> %res4
@@ -581,8 +587,7 @@ declare <16 x i32> @llvm.x86.avx512.pbro
 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0
@@ -643,8 +648,7 @@ define <16 x i32> @test_maskz_conflict_d
 define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_conflict_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -689,8 +693,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16
 define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_lzcnt_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -713,8 +716,7 @@ declare <16 x float> @llvm.x86.avx512.ma
 define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
 ; CHECK-LABEL: test_x86_mask_blend_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vblendmpd %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
@@ -724,8 +726,7 @@ define <8 x double> @test_x86_mask_blend
 define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vblendmpd (%rdi), %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %b = load <8 x double>, <8 x double>* %ptr
@@ -748,8 +749,7 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
 ; CHECK-LABEL: test_x86_mask_blend_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
@@ -825,8 +825,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpabsq %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -946,8 +945,7 @@ declare <16 x float> @llvm.x86.avx512.ma
 define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_pd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm0
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z}
@@ -961,12 +959,11 @@ define <8 x double> @test_mask_load_alig
 }
 
 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
-    
+
 define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_pd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z}
@@ -993,8 +990,7 @@ define <8 x i64> @test_valign_q(<8 x i64
 define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
 ; CHECK-LABEL: test_mask_valign_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1064,8 +1060,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <
 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
@@ -1111,8 +1106,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <
 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    retq
@@ -1374,8 +1368,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64
 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %r8d
 ; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k0 {%k1}
@@ -1488,8 +1481,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i6
 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %r8d
 ; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
@@ -1635,8 +1627,7 @@ define <8 x i64> @test_x86_avx512_pslli_
 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1647,8 +1638,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1701,8 +1691,7 @@ define <8 x i64> @test_x86_avx512_psrli_
 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1713,8 +1702,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1767,8 +1755,7 @@ define <8 x i64> @test_x86_avx512_psrai_
 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -1779,8 +1766,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
@@ -1833,8 +1819,7 @@ define <8 x i64> @test_x86_avx512_psll_q
 define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psll_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1845,8 +1830,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1899,8 +1883,7 @@ define <8 x i64> @test_x86_avx512_psrl_q
 define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1911,8 +1894,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -1965,8 +1947,7 @@ define <8 x i64> @test_x86_avx512_psra_q
 define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psra_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -1977,8 +1958,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2031,8 +2011,7 @@ define <8 x i64> @test_x86_avx512_psllv_
 define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2043,8 +2022,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2098,8 +2076,7 @@ define <8 x i64> @test_x86_avx512_psrav_
 define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2110,8 +2087,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2164,8 +2140,7 @@ define <8 x i64> @test_x86_avx512_psrlv_
 define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2176,8 +2151,7 @@ define <8 x i64> @test_x86_avx512_mask_p
 define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
@@ -2378,8 +2352,7 @@ define <16 x float> @test_vmulps_mask_pa
 define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rn:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rn-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2390,8 +2363,7 @@ define <8 x double> @test_vmulpd_mask_rn
 define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rd-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2402,8 +2374,7 @@ define <8 x double> @test_vmulpd_mask_rd
 define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_ru:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {ru-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2414,8 +2385,7 @@ define <8 x double> @test_vmulpd_mask_ru
 define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) {
 ; CHECK-LABEL: test_vmulpd_mask_rz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmulpd {rz-sae}, %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
@@ -2501,8 +2471,7 @@ define <8 x i64> @test_xor_epi64(<8 x i6
 define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2524,8 +2493,7 @@ define <8 x i64> @test_or_epi64(<8 x i64
 define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2547,8 +2515,7 @@ define <8 x i64> @test_and_epi64(<8 x i6
 define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_epi64:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2779,8 +2746,7 @@ define <8 x i64> @test_mask_add_epi64_rr
 define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2791,8 +2757,7 @@ define <8 x i64> @test_mask_add_epi64_rr
 define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2812,8 +2777,7 @@ define <8 x i64> @test_mask_add_epi64_rm
 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2825,8 +2789,7 @@ define <8 x i64> @test_mask_add_epi64_rm
 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2849,8 +2812,7 @@ define <8 x i64> @test_mask_add_epi64_rm
 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2864,8 +2826,7 @@ define <8 x i64> @test_mask_add_epi64_rm
 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi64_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -2889,8 +2850,7 @@ define <8 x i64> @test_mask_sub_epi64_rr
 define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -2901,8 +2861,7 @@ define <8 x i64> @test_mask_sub_epi64_rr
 define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -2922,8 +2881,7 @@ define <8 x i64> @test_mask_sub_epi64_rm
 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2935,8 +2893,7 @@ define <8 x i64> @test_mask_sub_epi64_rm
 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -2959,8 +2916,7 @@ define <8 x i64> @test_mask_sub_epi64_rm
 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -2974,8 +2930,7 @@ define <8 x i64> @test_mask_sub_epi64_rm
 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -2999,8 +2954,7 @@ define <8 x i64> @test_mask_mul_epi32_rr
 define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -3011,8 +2965,7 @@ define <8 x i64> @test_mask_mul_epi32_rr
 define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3032,8 +2985,7 @@ define <8 x i64> @test_mask_mul_epi32_rm
 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3045,8 +2997,7 @@ define <8 x i64> @test_mask_mul_epi32_rm
 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3070,8 +3021,7 @@ define <8 x i64> @test_mask_mul_epi32_rm
 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3086,8 +3036,7 @@ define <8 x i64> @test_mask_mul_epi32_rm
 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -3112,8 +3061,7 @@ define <8 x i64> @test_mask_mul_epu32_rr
 define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rrk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm0
 ; CHECK-NEXT:    retq
@@ -3124,8 +3072,7 @@ define <8 x i64> @test_mask_mul_epu32_rr
 define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rrkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -3145,8 +3092,7 @@ define <8 x i64> @test_mask_mul_epu32_rm
 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3158,8 +3104,7 @@ define <8 x i64> @test_mask_mul_epu32_rm
 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %b = load <16 x i32>, <16 x i32>* %ptr_b
@@ -3183,8 +3128,7 @@ define <8 x i64> @test_mask_mul_epu32_rm
 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmbk:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -3199,8 +3143,7 @@ define <8 x i64> @test_mask_mul_epu32_rm
 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %q = load i64, i64* %ptr_b
@@ -4314,8 +4257,7 @@ define <16 x i32>@test_int_x86_avx512_ma
 define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4347,8 +4289,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4380,8 +4321,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4411,8 +4351,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4446,8 +4385,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
@@ -4481,8 +4419,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
@@ -4517,8 +4454,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm2
 ; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
@@ -4556,8 +4492,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
@@ -4590,8 +4525,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_scalef_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vscalefpd {rz-sae}, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vscalefpd {rn-sae}, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4622,8 +4556,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4655,8 +4588,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -4688,8 +4620,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 = zmm2[0],k1[0],zmm2[2],k1[2],zmm2[4],k1[4],zmm2[6],k1[6]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = k1[0],zmm0[0],k1[2],zmm0[2],k1[4],zmm0[4],k1[6],zmm0[6]
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -4709,8 +4640,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 = zmm2[1],k1[1],zmm2[3],k1[3],zmm2[5],k1[5],zmm2[7],k1[7]
 ; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -4778,8 +4708,7 @@ declare void @llvm.x86.avx512.mask.pmov.
 define void @test_int_x86_avx512_mask_pmov_qb_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_mem_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    retq
@@ -4861,8 +4790,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
@@ -4882,8 +4810,7 @@ declare void @llvm.x86.avx512.mask.pmov.
 define void @test_int_x86_avx512_mask_pmov_qw_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_mem_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    retq
@@ -4897,8 +4824,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
@@ -4932,8 +4858,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
@@ -4967,8 +4892,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
@@ -4988,8 +4912,7 @@ declare void @llvm.x86.avx512.mask.pmov.
 define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_mem_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    retq
@@ -5003,8 +4926,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
@@ -5038,8 +4960,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
@@ -5277,8 +5198,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -5310,8 +5230,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2dq %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2dq {rn-sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -5327,8 +5246,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2ps {ru-sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -5344,8 +5262,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2udq {ru-sae}, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2udq {rn-sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -5377,8 +5294,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2pd {sae}, %ymm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -5410,8 +5326,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2dq {sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -5427,8 +5342,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -5461,8 +5375,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2udq {sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -5703,8 +5616,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
@@ -5741,8 +5653,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -5758,8 +5669,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vgetmantpd $11, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vgetmantpd $11, {sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -5842,8 +5752,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 = zmm2[0],k1[1],zmm2[3],k1[2],zmm2[5],k1[4],zmm2[6],k1[6]
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm3 = k1[0],zmm0[1],k1[3],zmm0[2],k1[5],zmm0[4],k1[6],zmm0[6]
 ; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
@@ -5880,8 +5789,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 = zmm1[0,1,3,2,5,4,6,6]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = k1[0,1,3,2,5,4,6,6]
 ; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 = zmm0[0,1,3,2,5,4,6,6]
@@ -5921,8 +5829,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0
@@ -6002,8 +5909,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6023,8 +5929,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6112,8 +6017,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6130,8 +6034,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz
 define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
 ; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
@@ -6188,8 +6091,7 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
 ; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
@@ -6377,12 +6279,14 @@ declare <16 x float> @llvm.x86.avx512.ma
 
 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vaddps %zmm1, %zmm0, %zmm0
-; CHECK: vaddps %zmm0, %zmm2, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
 
   %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -6396,12 +6300,14 @@ declare <8 x double> @llvm.x86.avx512.ma
 
 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshuff64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vaddpd %zmm1, %zmm0, %zmm0
-; CHECK: vaddpd %zmm0, %zmm2, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
 
   %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -6415,12 +6321,14 @@ declare <16 x i32> @llvm.x86.avx512.mask
 
 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi32x4 $0, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddd %zmm1, %zmm0, %zmm0
-; CHECK: vpaddd %zmm0, %zmm2, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
 
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -6434,12 +6342,14 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 
 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
-; CHECK: kmovw %eax, %k1
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm2 {%k1} {z}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm1 {%k1}
-; CHECK: vshufi64x2 $68, %zmm0, %zmm0, %zmm0
-; CHECK: vpaddq %zmm1, %zmm0, %zmm0
-; CHECK: vpaddq %zmm0, %zmm2, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq
 
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
@@ -6454,8 +6364,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrlq $255, %zmm0, %zmm0
@@ -6515,8 +6424,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm0
@@ -6556,8 +6464,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm0
@@ -6577,13 +6484,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i16 %x1, <16 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 
-; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpshufd $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 	%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 %x3)
 	%res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> zeroinitializer, i8 %x3)
 	%res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i16 3, <16 x i32> %x2, i8 -1)
@@ -6597,13 +6504,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -6617,14 +6524,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -6638,13 +6544,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 
-; CHECK-NEXT:    vprold $3, %zmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vprold $3, %zmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vprold $3, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprold $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vprold $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vprold $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> zeroinitializer, i16 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i8 3, <16 x i32> %x2, i16 -1)
@@ -6658,14 +6564,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprolq $3, %zmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vprolq $3, %zmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vprolq $3, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprolq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vprolq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
@@ -6697,8 +6602,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_q:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1}
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 {%k1} {z}
@@ -6716,13 +6620,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6736,14 +6640,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6757,14 +6660,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxdq %ymm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6778,13 +6680,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwd %ymm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6798,14 +6700,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6819,13 +6720,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -6839,14 +6740,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -6860,14 +6760,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -6882,13 +6781,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -6903,14 +6802,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -6924,14 +6822,13 @@ declare <8 x double> @llvm.x86.avx512.ma
 define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i8 %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm0 
-; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpermpd $3, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i8 3, <8 x double> %x2, i8 -1)
@@ -6945,14 +6842,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i8 %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermq $3, %zmm0, %zmm1 {%k1} 
-; CHECK-NEXT:    vpermq $3, %zmm0, %zmm2 {%k1} {z} 
-; CHECK-NEXT:    vpermq $3, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpermq $3, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpermq $3, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpermq $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i8 3, <8 x i64> %x2, i8 -1)
@@ -6962,18 +6858,17 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
- 
+
 define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vpermpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -6987,14 +6882,13 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vpermq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -7010,13 +6904,13 @@ declare <16 x float> @llvm.x86.avx512.ma
 define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vpermps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -7030,13 +6924,13 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 
-; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm2 {%k1} 
-; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm3 {%k1} {z} 
-; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm0 
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1 
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vpermd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i8 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i8 -1)

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Mon Jan 18 06:02:45 2016
@@ -18,8 +18,7 @@ define i16 @mask16(i16 %x) {
 define i8 @mask8(i8 %x) {
 ; KNL-LABEL: mask8:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movzbl %dil, %eax
-; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    knotw %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    retq
@@ -232,7 +231,6 @@ define void @test7(<8 x i1> %mask)  {
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    movb $85, %al
-; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1337,8 +1335,7 @@ define <64 x i8> @test17(i64 %x, i32 %y,
 define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-LABEL: test18:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movzbl %dil, %eax
-; KNL-NEXT:    kmovw %eax, %k0
+; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kshiftlw $7, %k1, %k2
 ; KNL-NEXT:    kshiftrw $15, %k2, %k2

Modified: llvm/trunk/test/CodeGen/X86/avx512-select.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-select.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-select.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-select.ll Mon Jan 18 06:02:45 2016
@@ -1,62 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl  | FileCheck %s
 
-; CHECK-LABEL: select00
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
 define <16 x i32> @select00(i32 %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: select00:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    cmpl $255, %edi
+; CHECK-NEXT:    je LBB0_2
+; CHECK-NEXT:  ## BB#1:
+; CHECK-NEXT:    vmovaps %zmm0, %zmm1
+; CHECK-NEXT:  LBB0_2:
+; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %cmpres = icmp eq i32 %a, 255
   %selres = select i1 %cmpres, <16 x i32> zeroinitializer, <16 x i32> %b
   %res = xor <16 x i32> %b, %selres
   ret <16 x i32> %res
 }
 
-; CHECK-LABEL: select01
-; CHECK: vmovaps
-; CHECK-NEXT: LBB
 define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
+; CHECK-LABEL: select01:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    cmpl $255, %edi
+; CHECK-NEXT:    je LBB1_2
+; CHECK-NEXT:  ## BB#1:
+; CHECK-NEXT:    vmovaps %zmm0, %zmm1
+; CHECK-NEXT:  LBB1_2:
+; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq
   %cmpres = icmp eq i32 %a, 255
   %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b
   %res = xor <8 x i64> %b, %selres
   ret <8 x i64> %res
 }
 
-; CHECK-LABEL: @select02
-; CHECK: cmpless %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovss  %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
 define float @select02(float %a, float %b, float %c, float %eps) {
+; CHECK-LABEL: select02:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmpless %xmm0, %xmm3, %k1
+; CHECK-NEXT:    vmovss %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %cmp = fcmp oge float %a, %eps
   %cond = select i1 %cmp, float %c, float %b
   ret float %cond
 }
 
-; CHECK-LABEL: @select03
-; CHECK: cmplesd %xmm0, %xmm3, %k1
-; CHECK-NEXT: vmovsd  %xmm2, {{.*}}%xmm1 {%k1}
-; CHECK: ret
 define double @select03(double %a, double %b, double %c, double %eps) {
+; CHECK-LABEL: select03:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcmplesd %xmm0, %xmm3, %k1
+; CHECK-NEXT:    vmovsd %xmm2, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq
   %cmp = fcmp oge double %a, %eps
   %cond = select i1 %cmp, double %c, double %b
   ret double %cond
 }
 
-; CHECK-LABEL: @select04
-; CHECK: vmovaps %zmm3, %zmm1
-; CHECK-NEXT: ret
-; PR20677
 define <16 x double> @select04(<16 x double> %a, <16 x double> %b) {
+; CHECK-LABEL: select04:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovaps %zmm3, %zmm1
+; CHECK-NEXT:    retq
   %sel = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x double> %a, <16 x double> %b
   ret <16 x double> %sel
 }
 
-; CHECK-LABEL: select05
-; CHECK: movzbl  %sil, %eax
-; CHECK: kmovw   %eax, %k0
-; CHECK: movzbl  %dil, %eax
-; CHECK: kmovw   %eax, %k1
-; CHECK-NEXT: korw    %k1, %k0, %k0
-; CHECK-NEXT: kmovw   %k0, %eax
 define i8 @select05(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select05:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
   %r = select <8 x i1> %mask, <8 x i1> <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>, <8 x i1> %a
@@ -64,14 +83,14 @@ define i8 @select05(i8 %a.0, i8 %m) {
   ret i8 %res;
 }
 
-; CHECK-LABEL: select06
-; CHECK: movzbl  %sil, %eax
-; CHECK: kmovw   %eax, %k0
-; CHECK: movzbl  %dil, %eax
-; CHECK: kmovw   %eax, %k1
-; CHECK-NEXT: kandw    %k1, %k0, %k0
-; CHECK-NEXT: kmovw   %k0, %eax
 define i8 @select06(i8 %a.0, i8 %m) {
+; CHECK-LABEL: select06:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kandw %k1, %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
   %r = select <8 x i1> %mask, <8 x i1> %a, <8 x i1> zeroinitializer
@@ -79,19 +98,18 @@ define i8 @select06(i8 %a.0, i8 %m) {
   ret i8 %res;
 }
 
-; CHECK-LABEL: select07
-; CHECK-DAG: movzbl  %dl, %eax
-; CHECK-DAG: kmovw   %eax, %k0
-; CHECK-DAG: movzbl  %dil, %eax
-; CHECK-DAG: kmovw   %eax, %k1
-; CHECK-DAG: movzbl  %sil, %eax
-; CHECK-DAG: kmovw   %eax, %k2
-; CHECK: kandw %k0, %k1, %k1
-; CHECK-NEXT: knotw    %k0, %k0
-; CHECK-NEXT: kandw    %k0, %k2, %k0
-; CHECK-NEXT: korw %k0, %k1, %k0
-; CHECK-NEXT: kmovw   %k0, %eax
 define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
+; CHECK-LABEL: select07:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovw %esi, %k2
+; CHECK-NEXT:    kandw %k0, %k1, %k1
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kandw %k0, %k2, %k0
+; CHECK-NEXT:    korw %k0, %k1, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
   %b = bitcast i8 %b.0 to <8 x i1>

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Mon Jan 18 06:02:45 2016
@@ -651,8 +651,7 @@ define <2 x double> @test_mask_fmadd128_
 define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -669,8 +668,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -687,8 +685,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0
@@ -703,8 +700,7 @@ define <2 x double>@test_int_x86_avx512_
 define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -721,8 +717,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -739,8 +734,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0
@@ -755,8 +749,7 @@ define <4 x double>@test_int_x86_avx512_
 define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -773,8 +766,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -791,8 +783,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0
@@ -807,8 +798,7 @@ define <4 x float>@test_int_x86_avx512_m
 define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -825,8 +815,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -843,8 +832,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0
@@ -862,8 +850,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0
@@ -881,8 +868,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0
@@ -899,8 +885,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0
@@ -917,8 +902,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0
@@ -1006,8 +990,7 @@ define <2 x double> @test_mask_vfnmsub12
 define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
@@ -1024,8 +1007,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0
@@ -1040,8 +1022,7 @@ define <2 x double>@test_int_x86_avx512_
 define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
@@ -1058,8 +1039,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0
@@ -1074,8 +1054,7 @@ define <4 x double>@test_int_x86_avx512_
 define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
@@ -1092,8 +1071,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0
@@ -1108,8 +1086,7 @@ define <4 x float>@test_int_x86_avx512_m
 define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
@@ -1126,8 +1103,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0
@@ -1142,8 +1118,7 @@ define <8 x float>@test_int_x86_avx512_m
 define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0
@@ -1158,8 +1133,7 @@ define <2 x double>@test_int_x86_avx512_
 define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0
@@ -1174,8 +1148,7 @@ define <4 x double>@test_int_x86_avx512_
 define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
@@ -1190,8 +1163,7 @@ define <4 x float>@test_int_x86_avx512_m
 define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
@@ -1242,8 +1214,7 @@ define <2 x double> @test_mask_vfmaddsub
 define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1260,8 +1231,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1278,8 +1248,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0
@@ -1294,8 +1263,7 @@ define <2 x double>@test_int_x86_avx512_
 define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1312,8 +1280,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1330,8 +1297,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0
@@ -1346,8 +1312,7 @@ define <4 x double>@test_int_x86_avx512_
 define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1364,8 +1329,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1382,8 +1346,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0
@@ -1398,8 +1361,7 @@ define <4 x float>@test_int_x86_avx512_m
 define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1416,8 +1378,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1434,8 +1395,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0
@@ -1452,8 +1412,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0
@@ -1470,8 +1429,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0
@@ -1488,8 +1446,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0
@@ -1506,8 +1463,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm2, %zmm3
 ; CHECK-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1}
 ; CHECK-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0
@@ -4037,8 +3993,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -4054,8 +4009,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
@@ -4071,8 +4025,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0
@@ -4256,8 +4209,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0
@@ -4357,8 +4309,7 @@ declare <8 x i16> @llvm.x86.avx512.pbroa
 define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
@@ -4490,8 +4441,7 @@ declare <8 x i16> @llvm.x86.avx512.cvtma
 define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    vpmovm2w %k0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
@@ -4515,8 +4465,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
@@ -4556,8 +4505,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm0
@@ -4617,8 +4565,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
@@ -4638,8 +4585,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
@@ -4659,8 +4605,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm0
@@ -4720,8 +4665,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i16 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %esi, %k1 
 ; CHECK-NEXT:    vpshufd $3, %xmm0, %xmm1 {%k1} 
 ; CHECK-NEXT:    vpshufd $3, %xmm0, %xmm2 {%k1} {z} 
 ; CHECK-NEXT:    vpshufd $3, %xmm0, %xmm0 
@@ -4742,8 +4686,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i16 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %esi, %k1 
 ; CHECK-NEXT:    vpshufd $3, %ymm0, %ymm1 {%k1} 
 ; CHECK-NEXT:    vpshufd $3, %ymm0, %ymm2 {%k1} {z} 
 ; CHECK-NEXT:    vpshufd $3, %ymm0, %ymm0 
@@ -4764,8 +4707,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshufh_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm0
@@ -4807,8 +4749,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshufl_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %esi, %k1 
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm1 {%k1} 
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm2 {%k1} {z} 
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm0 
@@ -4870,8 +4811,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
@@ -4892,8 +4832,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
@@ -4933,8 +4872,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i8 %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm0
@@ -4994,8 +4932,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
@@ -5015,8 +4952,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm1 {%k1} 
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm2 {%k1} {z} 
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm0 
@@ -5057,8 +4993,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm1 {%k1} 
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm2 {%k1} {z} 
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0 
@@ -5098,8 +5033,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm1 {%k1} 
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm2 {%k1} {z} 
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0 
@@ -5119,8 +5053,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm1 {%k1} 
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm2 {%k1} {z} 
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0 
@@ -5140,8 +5073,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.
 define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermw %xmm1, %xmm0, %xmm2 {%k1} 
 ; CHECK-NEXT:    vpermw %xmm1, %xmm0, %xmm3 {%k1} {z} 
 ; CHECK-NEXT:    vpermw %xmm1, %xmm0, %xmm0 

Modified: llvm/trunk/test/CodeGen/X86/avx512cdvl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512cdvl-intrinsics.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512cdvl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512cdvl-intrinsics.ll Mon Jan 18 06:02:45 2016
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
 
 declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
@@ -7,8 +8,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vplzcntd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vplzcntd %xmm0, %xmm0
@@ -28,8 +28,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntd %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vplzcntd %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -45,8 +44,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vplzcntq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
@@ -62,8 +60,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vplzcntq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
@@ -79,8 +76,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpconflictd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpconflictd %xmm0, %xmm0
@@ -100,8 +96,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_vpconflict_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictd %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpconflictd %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -117,8 +112,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_vpconflict_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpconflictq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
@@ -134,8 +128,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_vpconflict_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpconflictq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpconflictq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
@@ -147,33 +140,45 @@ define <4 x i64>@test_int_x86_avx512_mas
 }
 
 define <8 x i32> @test_x86_vbroadcastmw_256(i16 %a0) {
-  ; CHECK: test_x86_vbroadcastmw_256
-  ; CHECK: vpbroadcastmw2d %k0, %ymm0
-  %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ; 
+; CHECK-LABEL: test_x86_vbroadcastmw_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmw2d %k0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16 %a0) ;
   ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx512.broadcastmw.256(i16)
 
 define <4 x i32> @test_x86_vbroadcastmw_128(i16 %a0) {
-  ; CHECK: test_x86_vbroadcastmw_128
-  ; CHECK: vpbroadcastmw2d %k0, %xmm0
-  %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ; 
+; CHECK-LABEL: test_x86_vbroadcastmw_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmw2d %k0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16 %a0) ;
   ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.avx512.broadcastmw.128(i16)
 
 define <4 x i64> @test_x86_broadcastmb_256(i8 %a0) {
-  ; CHECK: test_x86_broadcastmb_256
-  ; CHECK: vpbroadcastmb2q %k0, %ymm0
-  %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ; 
+; CHECK-LABEL: test_x86_broadcastmb_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmb2q %k0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8 %a0) ;
   ret <4 x i64> %res
 }
 declare <4 x i64> @llvm.x86.avx512.broadcastmb.256(i8)
 
 define <2 x i64> @test_x86_broadcastmb_128(i8 %a0) {
-  ; CHECK: test_x86_broadcastmb_128
-  ; CHECK: vpbroadcastmb2q %k0, %xmm0
-  %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ; 
+; CHECK-LABEL: test_x86_broadcastmb_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmb2q %k0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8 %a0) ;
   ret <2 x i64> %res
 }
 declare <2 x i64> @llvm.x86.avx512.broadcastmb.128(i8)

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Mon Jan 18 06:02:45 2016
@@ -4016,8 +4016,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
@@ -4033,8 +4032,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
@@ -4050,8 +4048,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -4067,8 +4064,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -4084,8 +4080,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4101,8 +4096,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4118,8 +4112,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -4135,8 +4128,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2ps:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2ps %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -4152,8 +4144,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2udq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4169,8 +4160,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2udq %ymm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4186,8 +4176,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4203,8 +4192,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -4220,8 +4208,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm0
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
@@ -4237,8 +4224,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
@@ -4254,8 +4240,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2udq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4271,8 +4256,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtps2udq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -4288,8 +4272,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4305,8 +4288,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4322,8 +4304,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2udq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4339,8 +4320,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4356,8 +4336,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4373,8 +4352,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -4390,8 +4368,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
@@ -4407,8 +4384,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
@@ -4424,8 +4400,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtudq2pd %xmm0, %xmm0
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
@@ -4441,8 +4416,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
@@ -4458,8 +4432,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -4475,8 +4448,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -4545,8 +4517,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
@@ -4569,8 +4540,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z}
@@ -4593,8 +4563,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufi32x4 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vshufi32x4 $22, %ymm1, %ymm0, %ymm0
@@ -4612,8 +4581,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufi64x2 $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT:    vshufi64x2 $22, %ymm1, %ymm0, %ymm0
@@ -4651,8 +4619,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm0
@@ -4672,8 +4639,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_getmant_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vgetmantpd $11, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vgetmantpd $11, %ymm0, %ymm0
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
@@ -4689,8 +4655,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_getmant_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vgetmantps $11, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vgetmantps $11, %xmm0, %xmm0
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
@@ -4706,8 +4671,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_getmant_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vgetmantps $11, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vgetmantps $11, %ymm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -4723,8 +4687,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufpd $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    ## xmm2 = xmm2[0],k1[1]
 ; CHECK-NEXT:    vshufpd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
@@ -4747,8 +4710,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufpd $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm2[0],k1[1],ymm2[3],k1[2]
 ; CHECK-NEXT:    vshufpd $22, %ymm1, %ymm0, %ymm0
@@ -4766,8 +4728,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufps $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    ## xmm2 = xmm2[2,1],k1[1,0]
 ; CHECK-NEXT:    vshufps $22, %xmm1, %xmm0, %xmm0
@@ -4785,8 +4746,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufps $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    ## ymm2 = ymm2[2,1],k1[1,0],ymm2[6,5],k1[5,4]
 ; CHECK-NEXT:    vshufps $22, %ymm1, %ymm0, %ymm0
@@ -4804,8 +4764,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    valignd $22, %xmm1, %xmm0, %xmm0
@@ -4825,8 +4784,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    valignd $22, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
@@ -4842,8 +4800,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq $22, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    valignq $22, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
@@ -4859,8 +4816,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_valign_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq $22, %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    valignq $22, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
@@ -4876,8 +4832,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd $22, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    ## ymm1 = ymm1[0,1,3,2]
 ; CHECK-NEXT:    vpermilpd $22, %ymm0, %ymm2 {%k1} {z}
@@ -4900,8 +4855,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd $1, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    ## xmm1 = xmm1[1,0]
 ; CHECK-NEXT:    vpermilpd $1, %xmm0, %xmm2 {%k1} {z}
@@ -4924,8 +4878,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilps $22, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    ## ymm1 = ymm1[2,1,1,0,6,5,5,4]
 ; CHECK-NEXT:    vpermilps $22, %ymm0, %ymm2 {%k1} {z}
@@ -4948,8 +4901,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilps $22, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    ## xmm1 = xmm1[2,1,1,0]
 ; CHECK-NEXT:    vpermilps $22, %xmm0, %xmm2 {%k1} {z}
@@ -4972,8 +4924,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
@@ -4993,8 +4944,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
@@ -5014,8 +4964,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
@@ -5035,8 +4984,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
@@ -5056,8 +5004,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %ymm0, %ymm0
@@ -5077,8 +5024,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %ymm0, %ymm0
@@ -5099,8 +5045,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0
@@ -5117,8 +5062,7 @@ declare <4 x i32> @llvm.x86.avx512.maskz
 define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0
@@ -5135,8 +5079,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0
@@ -5153,8 +5096,7 @@ declare <8 x i32> @llvm.x86.avx512.maskz
 define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0
@@ -5171,8 +5113,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0
@@ -5189,8 +5130,7 @@ declare <2 x i64> @llvm.x86.avx512.maskz
 define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0
@@ -5207,8 +5147,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1}
 ; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0
@@ -5225,8 +5164,7 @@ declare <4 x i64> @llvm.x86.avx512.maskz
 define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
 ; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0
@@ -5243,8 +5181,7 @@ declare <8 x i32> @llvm.x86.avx512.pbroa
 define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %ymm0
@@ -5264,8 +5201,7 @@ declare <4 x i32> @llvm.x86.avx512.pbroa
 define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %xmm0
@@ -5285,8 +5221,7 @@ declare <4 x i64> @llvm.x86.avx512.pbroa
 define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0
@@ -5306,8 +5241,7 @@ declare <2 x i64> @llvm.x86.avx512.pbroa
 define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
@@ -5324,7 +5258,7 @@ define <2 x i64>@test_int_x86_avx512_pbr
 
 define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
   ; CHECK: test_x86_vcvtph2ps_128
-  ; CHECK: vcvtph2ps  %xmm0, %xmm0    
+  ; CHECK: vcvtph2ps  %xmm0, %xmm0
   %res = call <4 x float> @llvm.x86.avx512.mask.vcvtph2ps.128(<8 x i16> %a0, <4 x float> zeroinitializer, i8 -1)
   ret <4 x float> %res
 }
@@ -5355,7 +5289,7 @@ define <8 x float> @test_x86_vcvtph2ps_2
 
 define <8 x float> @test_x86_vcvtph2ps_256_rrk(<8 x i16> %a0,<8 x float> %a1, i8 %mask) {
   ; CHECK: test_x86_vcvtph2ps_256_rrk
-  ; CHECK: vcvtph2ps  %xmm0, %ymm1 {%k1} 
+  ; CHECK: vcvtph2ps  %xmm0, %ymm1 {%k1}
   %res = call <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16> %a0, <8 x float> %a1, i8 %mask)
   ret <8 x float> %res
 }
@@ -5393,17 +5327,16 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm1 {%k1} 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    ## xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm2 {%k1} {z} 
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    ## xmm2 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vmovsldup %xmm0, %xmm0 
+; CHECK-NEXT:    vmovsldup %xmm0, %xmm0
 ; CHECK-NEXT:    ## xmm0 = xmm0[0,0,2,2]
-; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
   %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
@@ -5417,17 +5350,16 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm1 {%k1} 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm2 {%k1} {z} 
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vmovsldup %ymm0, %ymm0 
+; CHECK-NEXT:    vmovsldup %ymm0, %ymm0
 ; CHECK-NEXT:    ## ymm0 = ymm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
   %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
@@ -5441,17 +5373,16 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm1 {%k1} 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    ## xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm2 {%k1} {z} 
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    ## xmm2 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vmovshdup %xmm0, %xmm0 
+; CHECK-NEXT:    vmovshdup %xmm0, %xmm0
 ; CHECK-NEXT:    ## xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1)
   %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2)
@@ -5465,17 +5396,16 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm1 {%k1} 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    ## ymm1 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm2 {%k1} {z} 
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    ## ymm2 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vmovshdup %ymm0, %ymm0 
+; CHECK-NEXT:    vmovshdup %ymm0, %ymm0
 ; CHECK-NEXT:    ## ymm0 = ymm0[1,1,3,3,5,5,7,7]
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1)
   %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2)
@@ -5488,8 +5418,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovddup %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    ## xmm1 = xmm0[0,0]
 ; CHECK-NEXT:    vmovddup %xmm0, %xmm2 {%k1} {z}
@@ -5512,8 +5441,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_movddup_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovddup %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    ## ymm1 = ymm0[0,0,2,2]
 ; CHECK-NEXT:    vmovddup %ymm0, %ymm2 {%k1} {z}
@@ -5714,15 +5642,15 @@ declare <2 x double> @llvm.x86.avx512.rc
 
 define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256:
-; CHECK: kmovw   %eax, %k1
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-
-  %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) 
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) 
-  %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) 
+; CHECK: kmovw   %edi, %k1
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm2 {%k1} {z} 
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
+; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+
+  %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask)
+  %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
   %res3 = fadd <4 x double> %res, %res1
   %res4 = fadd <4 x double> %res2, %res3
   ret <4 x double> %res4
@@ -5731,15 +5659,15 @@ declare <4 x double> @llvm.x86.avx512.ma
 
 define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256:
-; CHECK: kmovw   %eax, %k1
+; CHECK: kmovw   %edi, %k1
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
 
-  %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) 
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) 
-  %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) 
+  %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask)
+  %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
   %res3 = fadd <8 x float> %res, %res1
   %res4 = fadd <8 x float> %res2, %res3
   ret <8 x float> %res4
@@ -5748,15 +5676,15 @@ declare <8 x float> @llvm.x86.avx512.mas
 
 define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) {
 ; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128:
-; CHECK: kmovw   %eax, %k1
+; CHECK: kmovw   %edi, %k1
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
 
-  %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) 
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) 
-  %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) 
+  %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask)
   %res3 = fadd <4 x float> %res, %res1
   %res4 = fadd <4 x float> %res2, %res3
   ret <4 x float> %res4
@@ -5768,7 +5696,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 
 define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256:
-; CHECK: kmovw %eax, %k1
+; CHECK: kmovw %edi, %k1
 ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
 ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
 ; CHECK: vshuff32x4 $0, %ymm0, %ymm0, %ymm0
@@ -5787,7 +5715,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 
 define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256:
-; CHECK: kmovw %eax, %k1
+; CHECK: kmovw %edi, %k1
 ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm2 {%k1} {z}
 ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm1 {%k1}
 ; CHECK: vshufi32x4 $0, %ymm0, %ymm0, %ymm0
@@ -5807,8 +5735,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
@@ -5828,8 +5755,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
@@ -5849,8 +5775,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrlq $255, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsrlq $255, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrlq $255, %xmm0, %xmm0
@@ -5870,8 +5795,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrlq $255, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpsrlq $255, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrlq $255, %ymm0, %ymm0
@@ -5889,8 +5813,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
@@ -5910,8 +5833,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
@@ -5931,8 +5853,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrld $255, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsrld $255, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrld $255, %xmm0, %xmm0
@@ -5952,8 +5873,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrld $255, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpsrld $255, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrld $255, %ymm0, %ymm0
@@ -5993,8 +5913,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psrlv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
@@ -6014,8 +5933,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psrlv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
@@ -6035,8 +5953,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psrlv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
@@ -6056,8 +5973,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psrlv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
@@ -6077,8 +5993,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psra_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrad %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsrad %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
@@ -6098,8 +6013,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psra_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
@@ -6119,8 +6033,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psra_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrad $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsrad $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrad $3, %xmm0, %xmm0
@@ -6140,8 +6053,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psra_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
@@ -6161,8 +6073,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psra_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsraq %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsraq %xmm1, %xmm0, %xmm0
@@ -6182,8 +6093,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psra_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsraq %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsraq %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsraq %xmm1, %ymm0, %ymm0
@@ -6203,8 +6113,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psra_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsraq $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsraq $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsraq $3, %xmm0, %xmm0
@@ -6224,8 +6133,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psra_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsraq $3, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpsraq $3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpsraq $3, %ymm0, %ymm0
@@ -6246,8 +6154,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psll_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpslld %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpslld %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpslld %xmm1, %xmm0, %xmm0
@@ -6267,8 +6174,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psll_d_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
@@ -6288,8 +6194,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psll_di_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpslld $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpslld $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
@@ -6309,8 +6214,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psll_di_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
@@ -6330,8 +6234,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psll_q_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
@@ -6351,8 +6254,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psll_qi_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsllq $3, %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpsllq $3, %xmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpsllq $3, %xmm0, %xmm0
@@ -6372,8 +6274,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
@@ -6391,8 +6292,7 @@ define <4 x i64>@test_int_x86_avx512_mas
 define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1 {%k1} {z}
@@ -6410,8 +6310,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovups (%rdi), %ymm0
 ; CHECK-NEXT:    vmovups (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovups (%rdi), %ymm1 {%k1} {z}
@@ -6429,8 +6328,7 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %ymm0
 ; CHECK-NEXT:    vmovapd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovapd (%rdi), %ymm1 {%k1} {z}
@@ -6448,8 +6346,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovupd (%rdi), %ymm0
 ; CHECK-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovupd (%rdi), %ymm1 {%k1} {z}
@@ -6467,8 +6364,7 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovaps (%rdi), %xmm1 {%k1} {z}
@@ -6486,8 +6382,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovups (%rdi), %xmm0
 ; CHECK-NEXT:    vmovups (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovups (%rdi), %xmm1 {%k1} {z}
@@ -6505,8 +6400,7 @@ declare <4 x float> @llvm.x86.avx512.mas
 define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm0
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm1 {%k1} {z}
@@ -6524,8 +6418,7 @@ declare <2 x double> @llvm.x86.avx512.ma
 define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_unaligned_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm1 {%k1} {z}
@@ -6545,8 +6438,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
@@ -6566,8 +6458,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psrav8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
@@ -6587,8 +6478,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psrav_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsravq %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
@@ -6608,8 +6498,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsravq %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsravq %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsravq %ymm1, %ymm0, %ymm0
@@ -6629,8 +6518,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_psllv2_di(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
@@ -6650,8 +6538,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_psllv4_di(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
@@ -6671,8 +6558,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_psllv4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
@@ -6692,8 +6578,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2 {%k1}
 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm3 {%k1} {z}
 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
@@ -6713,14 +6598,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm2 {%k1} 
-; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -6734,14 +6618,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6755,14 +6638,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm2 {%k1} 
-; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -6776,14 +6658,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -6796,14 +6677,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprold $3, %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vprold $3, %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vprold $3, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprold $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vprold $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vprold $3, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
@@ -6817,14 +6697,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i8 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprold $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vprold $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vprold $3, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprold $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vprold $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vprold $3, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
@@ -6838,14 +6717,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprolq $3, %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vprolq $3, %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vprolq $3, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprolq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vprolq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vprolq $3, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
@@ -6859,14 +6737,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prol_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprolq $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vprolq $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vprolq $3, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprolq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vprolq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vprolq $3, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
@@ -6880,8 +6757,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32> @test_mask_load_aligned_d_128(<4 x i32> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %xmm0
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %xmm1 {%k1} {z}
@@ -6899,8 +6775,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32> @test_mask_load_aligned_d_256(<8 x i32> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %ymm1 {%k1} {z}
@@ -6918,8 +6793,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64> @test_mask_load_aligned_q_128(<2 x i64> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %xmm0
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %xmm0 {%k1}
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %xmm1 {%k1} {z}
@@ -6937,8 +6811,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_mask_load_aligned_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm0
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm0 {%k1}
 ; CHECK-NEXT:    vmovdqa64 (%rdi), %ymm1 {%k1} {z}
@@ -6956,11 +6829,10 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 
+; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm1
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
@@ -6977,14 +6849,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax
-; CHECK-NEXT:    kmovw %eax, %k1
-; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -6998,14 +6869,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm2 {%k1} 
-; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} 
-; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
@@ -7019,14 +6889,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -7040,14 +6909,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprord $3, %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vprord $3, %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vprord $3, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprord $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vprord $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vprord $3, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> zeroinitializer, i8 %x3)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i8 3, <4 x i32> %x2, i8 -1)
@@ -7061,14 +6929,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pror_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprord $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vprord $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vprord $3, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprord $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vprord $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vprord $3, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> zeroinitializer, i8 %x3)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i8 3, <8 x i32> %x2, i8 -1)
@@ -7082,14 +6949,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i8 %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorq $3, %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vprorq $3, %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vprorq $3, %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprorq $3, %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vprorq $3, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vprorq $3, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> zeroinitializer, i8 %x3)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i8 3, <2 x i64> %x2, i8 -1)
@@ -7103,14 +6969,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pror_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vprorq $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vprorq $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vprorq $3, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vprorq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vprorq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vprorq $3, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
@@ -7124,14 +6989,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7145,14 +7009,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7166,14 +7029,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7187,14 +7049,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxbq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7208,14 +7069,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxdq %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1)
@@ -7229,14 +7089,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxdq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1)
@@ -7250,14 +7109,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7271,14 +7129,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7292,14 +7149,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7313,14 +7169,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovzxwq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7334,14 +7189,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1)
@@ -7355,14 +7209,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1)
@@ -7376,14 +7229,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1)
@@ -7397,14 +7249,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1)
@@ -7418,14 +7269,13 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2)
   %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1)
@@ -7439,14 +7289,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1)
@@ -7460,14 +7309,13 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0 
-; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1 
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2)
   %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1)
@@ -7481,14 +7329,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1)
@@ -7502,15 +7349,14 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i8 %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm0 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpermpd $3, %ymm0, %ymm0
 ; CHECK-NEXT:    ## ymm0 = ymm0[3,0,0,0]
-; CHECK-NEXT:    vaddpd %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vaddpd %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> %x2, i8 %x3)
   %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> zeroinitializer, i8 %x3)
   %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i8 3, <4 x double> %x2, i8 -1)
@@ -7524,15 +7370,14 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i8 %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %sil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermq $3, %ymm0, %ymm1 {%k1} 
-; CHECK-NEXT:    vpermq $3, %ymm0, %ymm2 {%k1} {z} 
-; CHECK-NEXT:    vpermq $3, %ymm0, %ymm0 
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vpermq $3, %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpermq $3, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpermq $3, %ymm0, %ymm0
 ; CHECK-NEXT:    ## ymm0 = ymm0[3,0,0,0]
-; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i8 3, <4 x i64> %x2, i8 -1)
@@ -7545,14 +7390,13 @@ declare <4 x double> @llvm.x86.avx512.ma
 define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vpermpd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
   %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
   %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -7566,14 +7410,13 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vpermq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
   %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
@@ -7589,14 +7432,13 @@ declare <8 x float> @llvm.x86.avx512.mas
 define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vpermps %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
   %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -7610,14 +7452,13 @@ declare <8 x i32> @llvm.x86.avx512.mask.
 define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movzbl %dil, %eax 
-; CHECK-NEXT:    kmovw %eax, %k1 
-; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm2 {%k1} 
-; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm3 {%k1} {z} 
-; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm0 
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 
-; CHECK-NEXT:    retq 
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm3 {%k1} {z}
+; CHECK-NEXT:    vpermd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1
+; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
   %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)

Modified: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll Mon Jan 18 06:02:45 2016
@@ -279,8 +279,7 @@ define <8 x i32> @test7(i32* %base, <8 x
 ;
 ; KNL_64-LABEL: test7:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    movzbl %sil, %eax
-; KNL_64-NEXT:    kmovw %eax, %k1
+; KNL_64-NEXT:    kmovw %esi, %k1
 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
 ; KNL_64-NEXT:    kmovw %k1, %k2
 ; KNL_64-NEXT:    vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2}
@@ -1128,7 +1127,6 @@ define <2 x i32> @test24(i32* %base, <2
 ; KNL_64-LABEL: test24:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    movb $3, %al
-; KNL_64-NEXT:    movzbl %al, %eax
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
@@ -1215,7 +1213,6 @@ define <2 x i64> @test26(i64* %base, <2
 ; KNL_64-LABEL: test26:
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    movb $3, %al
-; KNL_64-NEXT:    movzbl %al, %eax
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
@@ -1260,7 +1257,6 @@ define <2 x float> @test27(float* %base,
 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_64-NEXT:    movb $3, %al
-; KNL_64-NEXT:    movzbl %al, %eax
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
 ; KNL_64-NEXT:    retq
@@ -1271,7 +1267,6 @@ define <2 x float> @test27(float* %base,
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm1
 ; KNL_32-NEXT:    movb $3, %cl
-; KNL_32-NEXT:    movzbl %cl, %ecx
 ; KNL_32-NEXT:    kmovw %ecx, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
 ; KNL_32-NEXT:    retl
@@ -1297,7 +1292,6 @@ define void @test28(<2 x i32>%a1, <2 x i
 ; KNL_64:       # BB#0:
 ; KNL_64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; KNL_64-NEXT:    movb $3, %al
-; KNL_64-NEXT:    movzbl %al, %eax
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
 ; KNL_64-NEXT:    retq

Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll?rev=258045&r1=258044&r2=258045&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-v1.ll Mon Jan 18 06:02:45 2016
@@ -162,8 +162,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7
 define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
@@ -192,8 +191,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u
 define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
@@ -223,8 +221,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %
 define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
@@ -250,8 +247,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a
 define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
@@ -281,8 +277,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a
 define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
 ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
 ; AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
@@ -312,10 +307,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %
 define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    movzbl %dil, %eax
-; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    movb $51, %al
-; AVX512F-NEXT:    movzbl %al, %eax
 ; AVX512F-NEXT:    kmovw %eax, %k2
 ; AVX512F-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0
 ; AVX512F-NEXT:    vmovdqu64 %zmm0, %zmm1 {%k2} {z}




More information about the llvm-commits mailing list