[llvm] r333843 - [X86][AVX512] Cleanup intrinsics tests

Simon Pilgrim via llvm-commits llvm-commits@lists.llvm.org
Sun Jun 3 07:56:04 PDT 2018
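
The RUN lines now cover both 32-bit (i686) and 64-bit (x86_64) triples and add
--show-mc-encoding, so the autogenerated CHECK lines verify the instruction
encodings as well as the assembly. As a minimal sketch (assuming a built llc is
on PATH; otherwise pass --llc-binary), a test like this is regenerated with:

  $ utils/update_llc_test_checks.py test/CodeGen/X86/avx512-intrinsics-upgrade.ll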


Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll Sun Jun  3 07:56:04 2018
@@ -1,31 +1,54 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
-; CHECK-LABEL: unpckbw_test:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kunpckbw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    retq
+; X86-LABEL: unpckbw_test:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k0 ## encoding: [0xc5,0xf8,0x92,0xc0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    kunpckbw %k1, %k0, %k0 ## encoding: [0xc5,0xfd,0x4b,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: unpckbw_test:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    kunpckbw %k1, %k0, %k0 ## encoding: [0xc5,0xfd,0x4b,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
   ret i16 %res
 }
 
 define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm1
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastd %eax, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xc8]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc0]
+; X86-NEXT:    vpbroadcastd %eax, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd0]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpbroadcastd %edi, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xcf]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpbroadcastd %edi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc7]
+; X64-NEXT:    vpbroadcastd %edi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd7]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
     %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
     %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask)
     %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask)
@@ -37,15 +60,28 @@ declare <16 x i32> @llvm.x86.avx512.mask
 
 
 define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovq {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x4c,0x24,0x04]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd1]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc1]
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc9]
+; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpbroadcastq %rdi, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xcf]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x7c,0xc7]
+; X64-NEXT:    vpbroadcastq %rdi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7c,0xd7]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
    %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
    %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask)
    %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask)
@@ -59,15 +95,25 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly
 
 define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vbroadcastss %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm1
-; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_vbroadcast_ss_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8]
+; X86-NEXT:    vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9]
+; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0]
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vbroadcast_ss_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8]
+; X64-NEXT:    vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9]
+; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0]
+; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask)
@@ -80,15 +126,26 @@ define <16 x float> @test_x86_vbroadcast
 declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly
 
 define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
-; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_vbroadcast_sd_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8]
+; X86-NEXT:    vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9]
+; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0]
+; X86-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vbroadcast_sd_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8]
+; X64-NEXT:    vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9]
+; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0]
+; X64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask)
@@ -101,15 +158,25 @@ define <8 x double> @test_x86_vbroadcast
 declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_pbroadcastd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8]
+; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_pbroadcastd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8]
+; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
   %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask)
@@ -121,15 +188,26 @@ define <16 x i32>@test_int_x86_avx512_pb
 declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_pbroadcastq_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8]
+; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_pbroadcastq_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8]
+; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
   %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
@@ -141,15 +219,31 @@ define <8 x i64>@test_int_x86_avx512_pbr
 declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
-; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
-; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_movsldup_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovsldup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x12,0xd0]
+; X86-NEXT:    ## zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vmovsldup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x12,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    vaddps %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xca]
+; X86-NEXT:    vmovsldup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x12,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_movsldup_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovsldup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x12,0xd0]
+; X64-NEXT:    ## zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovsldup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x12,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT:    vaddps %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xca]
+; X64-NEXT:    vmovsldup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x12,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
+; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
@@ -161,15 +255,31 @@ define <16 x float>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm1
-; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_movshdup_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovshdup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x16,0xd0]
+; X86-NEXT:    ## zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vmovshdup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x16,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X86-NEXT:    vaddps %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xca]
+; X86-NEXT:    vmovshdup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x16,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_movshdup_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovshdup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x16,0xd0]
+; X64-NEXT:    ## zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovshdup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x16,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT:    vaddps %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xca]
+; X64-NEXT:    vmovshdup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x16,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
@@ -181,15 +291,32 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_movddup_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
-; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_movddup_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0]
+; X86-NEXT:    ## zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; X86-NEXT:    vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca]
+; X86-NEXT:    vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X86-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_movddup_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0]
+; X64-NEXT:    ## zmm2 = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT:    vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca]
+; X64-NEXT:    vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
+; X64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
@@ -201,15 +328,32 @@ define <8 x double>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_perm_df_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_perm_df_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03]
+; X86-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermpd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X86-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_perm_df_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03]
+; X64-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpermpd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X64-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
@@ -221,15 +365,32 @@ define <8 x double>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_perm_di_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermq {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_perm_di_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03]
+; X86-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_perm_di_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03]
+; X64-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
@@ -239,13 +400,23 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
-; CHECK-LABEL: test_store1:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovups %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_store1:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovups %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x01]
+; X86-NEXT:    vmovups %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_store1:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovups %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07]
+; X64-NEXT:    vmovups %zmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
   call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
   ret void
@@ -254,13 +425,24 @@ define void @test_store1(<16 x float> %d
 declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
 
 define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
-; CHECK-LABEL: test_store2:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovupd %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_store2:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
+; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X86-NEXT:    vmovupd %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x11,0x01]
+; X86-NEXT:    vmovupd %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_store2:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovupd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x11,0x07]
+; X64-NEXT:    vmovupd %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
   call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
   ret void
@@ -269,13 +451,23 @@ define void @test_store2(<8 x double> %d
 declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
 
 define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_ps:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovaps %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_store_aligned_ps:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovaps %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x01]
+; X86-NEXT:    vmovaps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_store_aligned_ps:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovaps %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x29,0x07]
+; X64-NEXT:    vmovaps %zmm0, (%rsi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
   call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
   ret void
@@ -284,13 +476,24 @@ define void @test_mask_store_aligned_ps(
 declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
 
 define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
-; CHECK-LABEL: test_mask_store_aligned_pd:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovapd %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_store_aligned_pd:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
+; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X86-NEXT:    vmovapd %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x29,0x01]
+; X86-NEXT:    vmovapd %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_store_aligned_pd:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovapd %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x29,0x07]
+; X64-NEXT:    vmovapd %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
   call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
   ret void
@@ -299,13 +502,24 @@ define void @test_mask_store_aligned_pd(
 declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
 
 define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_storeu_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
+; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X86-NEXT:    vmovdqu64 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqu64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_storeu_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqu64 %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
   call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
   ret void
@@ -314,13 +528,23 @@ define void@test_int_x86_avx512_mask_sto
 declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
 
 define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_storeu_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu32 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqu64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_storeu_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqu64 %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
   call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
   ret void
@@ -329,13 +553,24 @@ define void@test_int_x86_avx512_mask_sto
 declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
 
 define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_store_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
+; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X86-NEXT:    vmovdqa64 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqa64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_store_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqa64 %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
   call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
   ret void
@@ -344,13 +579,23 @@ define void@test_int_x86_avx512_mask_sto
 declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
 
 define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_store_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqa32 %zmm0, (%ecx) {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqa64 %zmm0, (%eax) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_store_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqa64 %zmm0, (%rsi) ## encoding: [0x62,0xf1,0xfd,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
   call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
   ret void
@@ -359,14 +604,24 @@ define void@test_int_x86_avx512_mask_sto
 declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
 
 define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_ps:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovaps (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_aligned_ps:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovaps (%eax), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vmovaps (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x00]
+; X86-NEXT:    vmovaps (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x08]
+; X86-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_aligned_ps:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovaps (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0x07]
+; X64-NEXT:    vmovaps (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0x0f]
+; X64-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
@@ -377,14 +632,24 @@ define <16 x float> @test_mask_load_alig
 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
 
 define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_ps:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovups (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovups (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovups (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_unaligned_ps:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovups (%eax), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vmovups (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x00]
+; X86-NEXT:    vmovups (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x08]
+; X86-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_unaligned_ps:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovups (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x10,0x07]
+; X64-NEXT:    vmovups (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x10,0x0f]
+; X64-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
@@ -395,14 +660,25 @@ define <16 x float> @test_mask_load_unal
 declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
 
 define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_pd:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_aligned_pd:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovapd (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vmovapd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x00]
+; X86-NEXT:    vmovapd (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x08]
+; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_aligned_pd:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovapd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovapd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x28,0x07]
+; X64-NEXT:    vmovapd (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x28,0x0f]
+; X64-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
@@ -413,14 +689,25 @@ define <8 x double> @test_mask_load_alig
 declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
 
 define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_pd:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovupd (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_unaligned_pd:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovupd (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vmovupd (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x00]
+; X86-NEXT:    vmovupd (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x08]
+; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_unaligned_pd:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovupd (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x10,0x07]
+; X64-NEXT:    vmovupd (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x10,0x0f]
+; X64-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
@@ -433,14 +720,25 @@ declare <8 x double> @llvm.x86.avx512.ma
 declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
 
 define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqu32 (%rsi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_unaligned_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqu32 (%ecx), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x09]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_unaligned_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqu32 (%rsi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x6f,0x06]
+; X64-NEXT:    vmovdqu32 (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
@@ -451,14 +749,26 @@ define <16 x i32> @test_mask_load_unalig
 declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
 
 define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
-; CHECK-LABEL: test_mask_load_unaligned_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %edx, %k1
-; CHECK-NEXT:    vmovdqu64 (%rsi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_unaligned_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx ## encoding: [0x0f,0xb6,0x54,0x24,0x0c]
+; X86-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X86-NEXT:    vmovdqu64 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x09]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_unaligned_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; X64-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; X64-NEXT:    vmovdqu64 (%rsi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfe,0x49,0x6f,0x06]
+; X64-NEXT:    vmovdqu64 (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfe,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
@@ -469,14 +779,24 @@ define <8 x i64> @test_mask_load_unalign
 declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8*, <16 x i32>, i16)
 
 define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, i8* %ptr, i16 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_aligned_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x00]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqa32 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqa32 (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x08]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_aligned_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0x07]
+; X64-NEXT:    vmovdqa32 (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> %res, i16 %mask)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
@@ -487,14 +807,25 @@ define <16 x i32> @test_mask_load_aligne
 declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8*, <8 x i64>, i8)
 
 define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) {
-; CHECK-LABEL: test_mask_load_aligned_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1}
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_load_aligned_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x00]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqa64 (%eax), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x08]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_load_aligned_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x07]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0x07]
+; X64-NEXT:    vmovdqa64 (%rdi), %zmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> %res, i8 %mask)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
@@ -505,15 +836,32 @@ define <8 x i64> @test_mask_load_aligned
 declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermilpd {{.*#+}} zmm2 = zmm0[0,1,3,2,5,4,6,6]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermilpd {{.*#+}} zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
-; CHECK-NEXT:    vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16]
+; X86-NEXT:    ## zmm2 = zmm0[0,1,3,2,5,4,6,6]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
+; X86-NEXT:    vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
+; X86-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X86-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16]
+; X64-NEXT:    ## zmm2 = zmm0[0,1,3,2,5,4,6,6]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6]
+; X64-NEXT:    vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6]
+; X64-NEXT:    vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0]
+; X64-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
@@ -525,15 +873,31 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16]
+; X86-NEXT:    ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X86-NEXT:    vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X86-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X86-NEXT:    vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16]
+; X64-NEXT:    ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X64-NEXT:    vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12]
+; X64-NEXT:    vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0]
+; X64-NEXT:    vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
@@ -545,15 +909,31 @@ define <16 x float>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03]
+; X86-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03]
+; X86-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X86-NEXT:    vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03]
+; X64-NEXT:    ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03]
+; X64-NEXT:    ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X64-NEXT:    vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
 	%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
 	%res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
 	%res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
@@ -565,24 +945,33 @@ define <16 x i32>@test_int_x86_avx512_ma
 define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_pcmpeq_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
 }
 
 define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_pcmpeq_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpeq_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
 }
@@ -592,24 +981,33 @@ declare i16 @llvm.x86.avx512.mask.pcmpeq
 define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_pcmpeq_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
 }
 
 define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpeq_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_pcmpeq_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %al ## encoding: [0x22,0x44,0x24,0x04]
+; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpeq_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
 }
@@ -619,24 +1017,33 @@ declare i8 @llvm.x86.avx512.mask.pcmpeq.
 define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_pcmpgt_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
 }
 
 define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %edi, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_pcmpgt_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x04]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpgt_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andl %edi, %eax ## encoding: [0x21,0xf8]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
 }
@@ -646,24 +1053,33 @@ declare i16 @llvm.x86.avx512.mask.pcmpgt
 define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_pcmpgt_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xc1]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
 }
 
 define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_pcmpgt_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andb %dil, %al
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_pcmpgt_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xc1]
+; X86-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X86-NEXT:    andb {{[0-9]+}}(%esp), %al ## encoding: [0x22,0x44,0x24,0x04]
+; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpgt_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andb %dil, %al ## encoding: [0x40,0x20,0xf8]
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
 }
@@ -673,13 +1089,26 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.
 declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X86-NEXT:    vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT:    vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
   %res2 = fadd <8 x double> %res, %res1
@@ -689,13 +1118,25 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X86-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -705,13 +1146,26 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
   %res2 = fadd <8 x double> %res, %res1
@@ -721,13 +1175,25 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X86-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -737,15 +1203,32 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
@@ -757,13 +1240,26 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -773,13 +1269,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpunpckhdq {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -789,13 +1297,25 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpunpckldq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9]
+; X86-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9]
+; X64-NEXT:    ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -805,29 +1325,42 @@ define <16 x i32>@test_int_x86_avx512_ma
 define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) {
 ; CHECK-LABEL: test_x86_avx512_pslli_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xf0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_pslli_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xf0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_pslli_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpslld $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xf0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_pslli_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xf0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_pslli_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpslld $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xf0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -837,29 +1370,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) {
 ; CHECK-LABEL: test_x86_avx512_pslli_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x73,0xf0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_pslli_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_pslli_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xf0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_pslli_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xf0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_pslli_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_pslli_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xf0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_pslli_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xf0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -869,29 +1417,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrli_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xd0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrli_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrli_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrld $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrli_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrli_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrld $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -901,29 +1462,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrli_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x73,0xd0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrli_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrli_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrli_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrli_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrli_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrli_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -933,29 +1509,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrai_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0x72,0xe0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrai_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xe0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrai_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrad $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xe0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrai_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xe0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrai_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrad $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xe0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -965,29 +1554,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) {
 ; CHECK-LABEL: test_x86_avx512_psrai_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x72,0xe0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrai_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrai_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x72,0xe0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrai_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsraq $7, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x72,0xe0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrai_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrai_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x72,0xe0,0x07]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrai_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsraq $7, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x72,0xe0,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -997,11 +1601,18 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>)
 
 define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) {
-; CHECK-LABEL: test_storent_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_storent_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_storent_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
   ret void
 }
@@ -1009,11 +1620,18 @@ define void@test_storent_q_512(<8 x i64>
 declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>)
 
 define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) {
-; CHECK-LABEL: test_storent_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_storent_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_storent_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
   ret void
 }
@@ -1021,11 +1639,18 @@ define void @test_storent_pd_512(<8 x do
 declare void @llvm.x86.avx512.storent.ps.512(i8*, <16 x float>)
 
 define void @test_storent_ps_512(<16 x float> %data, i8* %ptr) {
-; CHECK-LABEL: test_storent_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_storent_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovntps %zmm0, (%eax) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x00]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_storent_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovntps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
   ret void
 }
@@ -1033,19 +1658,26 @@ define void @test_storent_ps_512(<16 x f
 define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_xor_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_xor_epi32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_xor_epi32:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_xor_epi32:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpxord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xef,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
@@ -1055,19 +1687,26 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_or_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_or_epi32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_or_epi32:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_or_epi32:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpord %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xeb,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
@@ -1077,19 +1716,26 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_and_epi32:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_and_epi32:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_and_epi32:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_and_epi32:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpandd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xdb,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
@@ -1099,19 +1745,27 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_xor_epi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xef,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_xor_epi64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_xor_epi64:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xef,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_xor_epi64:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpxorq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xef,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
@@ -1121,19 +1775,27 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_or_epi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xeb,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_or_epi64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_or_epi64:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xeb,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_or_epi64:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vporq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xeb,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
@@ -1143,19 +1805,27 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_and_epi64:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xdb,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_and_epi64:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_and_epi64:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xdb,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_and_epi64:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpandq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xdb,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
@@ -1165,71 +1835,111 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mask_add_epi32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpaddd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_add_epi32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddd (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddd (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfe,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddd (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfe,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_add_epi32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddd (%eax){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x58,0xfe,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x58,0xfe,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1238,12 +1948,20 @@ define <16 x i32> @test_mask_add_epi32_r
 }
 
 define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddd (%eax){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfe,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfe,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1252,11 +1970,18 @@ define <16 x i32> @test_mask_add_epi32_r
 }
 
 define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_add_epi32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddd (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfe,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfe,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1269,71 +1994,111 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mask_sub_epi32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsubd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_sub_epi32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubd (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfa,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubd (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubd (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfa,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubd (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubd (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xfa,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_sub_epi32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubd (%eax){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x58,0xfa,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x58,0xfa,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1342,12 +2107,20 @@ define <16 x i32> @test_mask_sub_epi32_r
 }
 
 define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubd (%eax){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfa,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x59,0xfa,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1356,11 +2129,18 @@ define <16 x i32> @test_mask_sub_epi32_r
 }
 
 define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_sub_epi32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubd (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfa,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubd (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xd9,0xfa,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1373,71 +2153,118 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_mask_add_epi64_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpaddq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
-; CHECK-LABEL: test_mask_add_epi64_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpaddq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpaddq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mask_add_epi64_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1446,12 +2273,24 @@ define <8 x i64> @test_mask_add_epi64_rm
 }
 
 define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1460,11 +2299,22 @@ define <8 x i64> @test_mask_add_epi64_rm
 }
 
 define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_add_epi64_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_add_epi64_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_add_epi64_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1477,71 +2327,118 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_mask_sub_epi64_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsubq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, <8 x i64>* %ptr_b) {
-; CHECK-LABEL: test_mask_sub_epi64_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpsubq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpsubq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mask_sub_epi64_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1550,12 +2447,24 @@ define <8 x i64> @test_mask_sub_epi64_rm
 }
 
 define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsubq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1564,11 +2473,22 @@ define <8 x i64> @test_mask_sub_epi64_rm
 }
 
 define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_sub_epi64_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_sub_epi64_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_sub_epi64_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsubq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -1581,71 +2501,111 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mask_mullo_epi32_rr_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x40,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rrk_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rrk_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x40,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rrk_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmulld %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x40,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
   ret < 16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rrkz_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rrkz_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x40,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rrkz_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmulld %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x40,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_mullo_epi32_rm_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rm_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmulld (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x40,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rm_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x40,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
  ret <16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rmk_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rmk_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpmulld (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x40,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rmk_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmulld (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x40,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
  ret <16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rmkz_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rmkz_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpmulld (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x40,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rmkz_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmulld (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x40,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
  ret <16 x i32> %res
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; CHECK-LABEL: test_mask_mullo_epi32_rmb_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rmb_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmulld (%eax){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x58,0x40,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rmb_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x58,0x40,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1654,12 +2614,20 @@ define <16 x i32> @test_mask_mullo_epi32
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <16 x i32> %passThru, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rmbk_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rmbk_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpmulld (%eax){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x59,0x40,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rmbk_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x59,0x40,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1668,11 +2636,18 @@ define <16 x i32> @test_mask_mullo_epi32
 }
 
 define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i16 %mask) {
-; CHECK-LABEL: test_mask_mullo_epi32_rmbkz_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mullo_epi32_rmbkz_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpmulld (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xd9,0x40,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mullo_epi32_rmbkz_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmulld (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xd9,0x40,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1686,13 +2661,25 @@ declare <16 x i32> @llvm.x86.avx512.mask
 declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshuff32x4 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x23,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X86-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_f32x4:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshuff32x4 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x23,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X64-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -1702,15 +2689,32 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm2
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X86-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X86-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X86-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X64-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X64-NEXT:    vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X64-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
@@ -1723,13 +2727,25 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshufi32x4 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x43,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_i32x4:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshufi32x4 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x43,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1739,13 +2755,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_i64x2:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -1755,15 +2784,32 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm2
-; CHECK-NEXT:    vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X86-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X86-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X64-NEXT:    vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6]
+; X64-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
@@ -1776,13 +2822,25 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vshufps {{.*#+}} zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshufps {{.*#+}} zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16]
+; X86-NEXT:    ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16]
+; X86-NEXT:    ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
+; X86-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16]
+; X64-NEXT:    ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16]
+; X64-NEXT:    ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12]
+; X64-NEXT:    vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -1792,13 +2850,21 @@ define <16 x float>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1808,13 +2874,22 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3d,0xd1]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3d,0xd1]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -1824,13 +2899,21 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1840,13 +2923,22 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3f,0xd1]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3f,0xd1]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -1856,13 +2948,21 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmins_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmins_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmins_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1872,13 +2972,22 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmins_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmins_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmins_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -1888,13 +2997,21 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pminu_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pminu_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pminu_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -1904,13 +3021,22 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pminu_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pminu_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1]
+; X86-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pminu_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1]
+; X64-NEXT:    vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -1918,11 +3044,18 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
-; CHECK-LABEL: test_mm_mask_move_ss:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mm_mask_move_ss:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x10,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_move_ss:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0x76,0x09,0x10,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
   ret <4 x float> %res
@@ -1930,33 +3063,54 @@ entry:
 
 
 define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
-; CHECK-LABEL: test_mm_maskz_move_ss:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mm_maskz_move_ss:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_move_ss:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
   ret <4 x float> %res
 }
 
 define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
-; CHECK-LABEL: test_mm_mask_move_sd:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mm_mask_move_sd:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x10,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm_mask_move_sd:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf1,0xf7,0x09,0x10,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
   ret <2 x double> %res
 }
 
 define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
-; CHECK-LABEL: test_mm_maskz_move_sd:
-; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mm_maskz_move_sd:
+; X86:       ## %bb.0: ## %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mm_maskz_move_sd:
+; X64:       ## %bb.0: ## %entry
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 entry:
   %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
   ret <2 x double> %res
@@ -1968,15 +3122,31 @@ declare <2 x double> @llvm.x86.avx512.ma
 declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0]
+; X86-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X86-NEXT:    vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0]
+; X64-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X64-NEXT:    vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -1988,15 +3158,32 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0]
+; X86-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0]
+; X64-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -2008,15 +3195,32 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0]
+; X86-NEXT:    ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X86-NEXT:    vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0]
+; X64-NEXT:    ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-NEXT:    vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -2028,15 +3232,31 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0]
+; X86-NEXT:    ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X86-NEXT:    vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0]
+; X64-NEXT:    ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X64-NEXT:    vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -2048,15 +3268,32 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0]
+; X86-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8]
+; X86-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X86-NEXT:    vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0]
+; X86-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0]
+; X64-NEXT:    ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8]
+; X64-NEXT:    ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT:    vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0]
+; X64-NEXT:    ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -2068,15 +3305,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8]
+; X86-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8]
+; X64-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
@@ -2088,15 +3335,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8]
+; X86-NEXT:    vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8]
+; X64-NEXT:    vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
@@ -2108,15 +3366,26 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8]
+; X86-NEXT:    vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8]
+; X64-NEXT:    vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
@@ -2129,15 +3398,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8]
+; X86-NEXT:    vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8]
+; X64-NEXT:    vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
@@ -2150,15 +3429,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8]
+; X86-NEXT:    vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8]
+; X64-NEXT:    vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
@@ -2170,15 +3460,26 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpsrlq $4, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xd0,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x04]
+; X86-NEXT:    vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x04]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrl_qi_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsrlq $4, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xd0,0x04]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xd0,0x04]
+; X64-NEXT:    vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xd0,0x04]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
@@ -2190,15 +3491,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psrl_di_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpsrld $4, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xd0,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x04]
+; X86-NEXT:    vpsrld $4, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x04]
+; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrl_di_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsrld $4, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xd0,0x04]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xd0,0x04]
+; X64-NEXT:    vpsrld $4, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xd0,0x04]
+; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 -1)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
@@ -2210,15 +3521,25 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psra_di_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psra_di_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpsrad $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xe0,0x03]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsrad $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xe0,0x03]
+; X86-NEXT:    vpsrad $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xe0,0x03]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psra_di_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsrad $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xe0,0x03]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsrad $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xe0,0x03]
+; X64-NEXT:    vpsrad $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xe0,0x03]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
@@ -2230,15 +3551,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psra_qi_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psra_qi_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpsraq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x72,0xe0,0x03]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x72,0xe0,0x03]
+; X86-NEXT:    vpsraq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x72,0xe0,0x03]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psra_qi_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsraq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x72,0xe0,0x03]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x72,0xe0,0x03]
+; X64-NEXT:    vpsraq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x72,0xe0,0x03]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
@@ -2250,15 +3582,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psll_di_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpslld $3, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpslld $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpslld $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psll_di_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpslld $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xf0,0x03]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpslld $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xf0,0x03]
+; X86-NEXT:    vpslld $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xf0,0x03]
+; X86-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psll_di_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpslld $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x6d,0x48,0x72,0xf0,0x03]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpslld $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x72,0xf0,0x03]
+; X64-NEXT:    vpslld $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x72,0xf0,0x03]
+; X64-NEXT:    vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
@@ -2270,15 +3612,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_psll_qi_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_psll_qi_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpsllq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf0,0x03]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xf0,0x03]
+; X86-NEXT:    vpsllq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xf0,0x03]
+; X86-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psll_qi_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsllq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf0,0x03]
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x49,0x73,0xf0,0x03]
+; X64-NEXT:    vpsllq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x73,0xf0,0x03]
+; X64-NEXT:    vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
@@ -2290,29 +3643,42 @@ define <8 x i64>@test_int_x86_avx512_mas
 define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psll_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xf2,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psll_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psll_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf2,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psll_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpslld %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf2,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psll_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psll_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf2,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psll_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpslld %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xf2,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2322,29 +3688,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psll_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf3,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psll_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psll_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf3,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psll_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf3,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psll_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psll_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf3,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psll_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf3,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2354,29 +3735,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrl_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xd2,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrl_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrl_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd2,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrl_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrld %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd2,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrl_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrl_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd2,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrl_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrld %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xd2,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2386,29 +3780,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrl_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd3,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrl_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrl_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd3,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrl_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd3,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrl_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrl_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd3,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrl_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd3,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2418,29 +3827,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psra_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xe2,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psra_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psra_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xe2,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psra_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrad %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xe2,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psra_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psra_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xe2,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psra_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrad %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0xe2,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2450,29 +3872,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psra_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xe2,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psra_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psra_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xe2,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psra_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsraq %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xe2,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psra_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psra_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xe2,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psra_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsraq %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xe2,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2482,29 +3919,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psllv_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x47,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psllv_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psllv_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x47,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psllv_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllvd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x47,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psllv_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psllv_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x47,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psllv_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x47,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2514,29 +3964,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psllv_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x47,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psllv_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psllv_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x47,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psllv_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllvq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x47,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psllv_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psllv_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x47,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psllv_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x47,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2547,29 +4012,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrav_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x46,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrav_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrav_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x46,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrav_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsravd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x46,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrav_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrav_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x46,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrav_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsravd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x46,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2579,29 +4057,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrav_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x46,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrav_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrav_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x46,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrav_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsravq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x46,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrav_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrav_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x46,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrav_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsravq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x46,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2611,29 +4104,42 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrlv_d:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x45,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrlv_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrlv_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x45,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrlv_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x45,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrlv_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrlv_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x45,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrlv_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x45,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2643,29 +4149,44 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_x86_avx512_psrlv_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x45,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_mask_psrlv_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mask_psrlv_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x45,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrlv_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x45,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_x86_avx512_maskz_psrlv_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_maskz_psrlv_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x45,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrlv_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x45,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
   ret <8 x i64> %res
 }
@@ -2673,10 +4194,16 @@ define <8 x i64> @test_x86_avx512_maskz_
 declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
 
 define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) {
-; CHECK-LABEL: test_x86_avx512_psrlv_q_memop:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpsrlvq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_psrlv_q_memop:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsrlvq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x45,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_psrlv_q_memop:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpsrlvq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x45,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
@@ -2685,13 +4212,22 @@ define <8 x i64> @test_x86_avx512_psrlv_
 declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8]
+; X86-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8]
+; X64-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
   %res2 = fadd <8 x double> %res, %res1
@@ -2701,13 +4237,22 @@ define <8 x double>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8]
+; X86-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8]
+; X64-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
   %res2 = fadd <8 x double> %res, %res1
@@ -2717,19 +4262,30 @@ define <8 x double>@test_int_x86_avx512_
 define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK-LABEL: test_valign_q:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    valignq {{.*#+}} zmm0 = zmm1[2,3,4,5,6,7],zmm0[0,1]
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    valignq $2, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x03,0xc1,0x02]
+; CHECK-NEXT:    ## zmm0 = zmm1[2,3,4,5,6,7],zmm0[0,1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
   ret <8 x i64> %res
 }
 
 define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) {
-; CHECK-LABEL: test_mask_valign_q:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    valignq {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7],zmm0[0,1]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_valign_q:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x03,0xd1,0x02]
+; X86-NEXT:    ## zmm2 {%k1} = zmm1[2,3,4,5,6,7],zmm0[0,1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_valign_q:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    valignq $2, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x03,0xd1,0x02]
+; X64-NEXT:    ## zmm2 {%k1} = zmm1[2,3,4,5,6,7],zmm0[0,1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
   ret <8 x i64> %res
 }
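
As the shuffle decode comments show, valignq $2 concatenates the two sources
(the second assembly operand supplies the low elements) and takes eight
consecutive qwords starting at index 2. A sketch of the same shuffle in
generic IR, with %a and %b as in test_valign_q:

    define <8 x i64> @valign_q_2_sketch(<8 x i64> %a, <8 x i64> %b) {
      ; indices 0..7 pick from %b, 8..15 pick from %a
      %res = shufflevector <8 x i64> %b, <8 x i64> %a,
             <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
      ret <8 x i64> %res
    }
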
@@ -2737,11 +4293,19 @@ define <8 x i64> @test_mask_valign_q(<8
 declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
 
 define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_maskz_valign_d:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4]
-; CHECK-NEXT:    retq
+; X86-LABEL: test_maskz_valign_d:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05]
+; X86-NEXT:    ## zmm0 {%k1} {z} = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_valign_d:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    valignd $5, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x03,0xc1,0x05]
+; X64-NEXT:    ## zmm0 {%k1} {z} = zmm1[5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1,2,3,4]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
   ret <16 x i32> %res
 }
@@ -2751,15 +4315,26 @@ declare <16 x i32> @llvm.x86.avx512.mask
 declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vaddpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1]
+; X86-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1]
+; X86-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X86-NEXT:    vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1]
+; X64-NEXT:    vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1]
+; X64-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X64-NEXT:    vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -2771,15 +4346,25 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1]
+; X86-NEXT:    vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1]
+; X86-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X86-NEXT:    vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1]
+; X64-NEXT:    vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1]
+; X64-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X64-NEXT:    vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -2790,15 +4375,37 @@ define <16 x float>@test_int_x86_avx512_
 
 ; Test case to make sure we can print shuffle decode comments for constant pool loads.
 define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
-; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm1
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
-; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool:
+; X86:       ## %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
+; X86-NEXT:    ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A]
+; X86-NEXT:    ## fixup A - offset: 6, value: LCPI203_0, kind: FK_Data_4
+; X86-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; X86-NEXT:    ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A]
+; X86-NEXT:    ## fixup A - offset: 6, value: LCPI203_1, kind: FK_Data_4
+; X86-NEXT:    vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9]
+; X86-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
+; X86-NEXT:    ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A]
+; X86-NEXT:    ## fixup A - offset: 6, value: LCPI203_2, kind: FK_Data_4
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15]
+; X64-NEXT:    ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A]
+; X64-NEXT:    ## fixup A - offset: 6, value: LCPI203_0-4, kind: reloc_riprel_4byte
+; X64-NEXT:    vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15]
+; X64-NEXT:    ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A]
+; X64-NEXT:    ## fixup A - offset: 6, value: LCPI203_1-4, kind: reloc_riprel_4byte
+; X64-NEXT:    vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9]
+; X64-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12]
+; X64-NEXT:    ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A]
+; X64-NEXT:    ## fixup A - offset: 6, value: LCPI203_2-4, kind: reloc_riprel_4byte
+; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 3, i32 2, i32 1, i32 0, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3>, <16 x float> zeroinitializer, i16 %x3)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 1, i32 0>, <16 x float> %x2, i16 -1)
@@ -2810,71 +4417,118 @@ define <16 x float>@test_int_x86_avx512_
 define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mask_mul_epi32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
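
vpmuldq reads only the even 32-bit element of each 64-bit lane, sign-extends
it, and forms a full 64-bit product. A generic-IR sketch of that semantics,
using the shl/ashr sign-extend-in-place idiom (not claimed to be the
auto-upgrader's exact output):

    define <8 x i64> @mul_epi32_sketch(<16 x i32> %a, <16 x i32> %b) {
      %a64 = bitcast <16 x i32> %a to <8 x i64>
      %b64 = bitcast <16 x i32> %b to <8 x i64>
      ; shl+ashr by 32 sign-extends the low dword of each qword
      %ahi = shl  <8 x i64> %a64, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
      %asx = ashr <8 x i64> %ahi, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
      %bhi = shl  <8 x i64> %b64, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
      %bsx = ashr <8 x i64> %bhi, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
      %res = mul <8 x i64> %asx, %bsx
      ret <8 x i64> %res
    }
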
 
 define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_mul_epi32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mask_mul_epi32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -2884,12 +4538,24 @@ define <8 x i64> @test_mask_mul_epi32_rm
 }
 
 define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -2899,11 +4565,22 @@ define <8 x i64> @test_mask_mul_epi32_rm
 }
 
 define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epi32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epi32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epi32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -2917,71 +4594,118 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mask_mul_epu32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mask_mul_epu32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mask_mul_epu32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -2991,12 +4715,24 @@ define <8 x i64> @test_mask_mul_epu32_rm
 }
 
 define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -3006,11 +4742,22 @@ define <8 x i64> @test_mask_mul_epu32_rm
 }
 
 define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mask_mul_epu32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_mul_epu32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_mul_epu32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -3022,12 +4769,20 @@ define <8 x i64> @test_mask_mul_epu32_rm
 declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
 
 define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) {
-; CHECK-LABEL: test_mask_vextractf32x4:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_vextractf32x4:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x19,0xc8,0x02]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vextractf32x4:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x19,0xc8,0x02]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
   ret <4 x float> %res
 }
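
Semantically, the merge-masked extract above is a 128-bit lane extract
followed by a per-element select against the pass-through operand, governed
by the low four bits of the i8 mask. A sketch with illustrative names:

    define <4 x float> @extractf32x4_mask_sketch(<4 x float> %b, <16 x float> %a, i8 %mask) {
      %lane = shufflevector <16 x float> %a, <16 x float> undef,
              <4 x i32> <i32 8, i32 9, i32 10, i32 11>   ; lane 2 = elements 8..11
      %mvec = bitcast i8 %mask to <8 x i1>
      %mlo  = shufflevector <8 x i1> %mvec, <8 x i1> undef,
              <4 x i32> <i32 0, i32 1, i32 2, i32 3>     ; low four mask bits
      %res  = select <4 x i1> %mlo, <4 x float> %lane, <4 x float> %b
      ret <4 x float> %res
    }
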
@@ -3035,11 +4790,18 @@ define <4 x float> @test_mask_vextractf3
 declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8)
 
 define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) {
-; CHECK-LABEL: test_mask_vextracti64x4:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_vextracti64x4:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3b,0xc8,0x01]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_vextracti64x4:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3b,0xc8,0x01]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask)
   ret <4 x i64> %res
 }
@@ -3047,12 +4809,20 @@ define <4 x i64> @test_mask_vextracti64x
 declare <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8)
 
 define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) {
-; CHECK-LABEL: test_maskz_vextracti32x4:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_maskz_vextracti32x4:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x39,0xc0,0x02]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_maskz_vextracti32x4:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x39,0xc0,0x02]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
   ret <4 x i32> %res
 }
@@ -3062,8 +4832,8 @@ declare <4 x i32> @llvm.x86.avx512.mask.
 define <4 x double> @test_vextractf64x4(<8 x double> %a) {
 ; CHECK-LABEL: test_vextractf64x4:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc0,0x01]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1)
   ret <4 x double> %res
 }
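
With an all-ones mask, vextractf64x4 $1 reduces to a plain upper-half
subvector shuffle; a sketch:

    define <4 x double> @extractf64x4_1_sketch(<8 x double> %a) {
      %res = shufflevector <8 x double> %a, <8 x double> undef,
             <4 x i32> <i32 4, i32 5, i32 6, i32 7>      ; elements 4..7
      ret <4 x double> %res
    }
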
@@ -3073,15 +4843,25 @@ declare <4 x double> @llvm.x86.avx512.ma
 declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
-; CHECK-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01]
+; X86-NEXT:    vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3]
+; X86-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01]
+; X86-NEXT:    vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_insertf32x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01]
+; X64-NEXT:    vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3]
+; X64-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01]
+; X64-NEXT:    vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4)
@@ -3093,15 +4873,25 @@ define <16 x float>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01]
+; X86-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01]
+; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_inserti32x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01]
+; X64-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01]
+; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4)
@@ -3113,15 +4903,26 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm2
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01]
+; X86-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X86-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01]
+; X86-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_insertf64x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01]
+; X64-NEXT:    vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3]
+; X64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01]
+; X64-NEXT:    vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4)
@@ -3133,15 +4934,26 @@ define <8 x double>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01]
+; X86-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01]
+; X86-NEXT:    vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_inserti64x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01]
+; X64-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01]
+; X64-NEXT:    vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
@@ -3151,10 +4963,16 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) {
-; CHECK-LABEL: test_x86_avx512_movntdqa:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovntdqa (%rdi), %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_movntdqa:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovntdqa (%eax), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x2a,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_movntdqa:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovntdqa (%rdi), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x2a,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.movntdqa(i8* %a0)
   ret <8 x i64> %res
 }
@@ -3164,29 +4982,30 @@ declare <8 x i64> @llvm.x86.avx512.movnt
 define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_cmp_d_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k5
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf1,0x75,0x48,0x66,0xc8]
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
+; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xe1,0x05]
+; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf1,0x7d,0x48,0x66,0xe9]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; CHECK-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
@@ -3207,31 +5026,58 @@ define <8 x i16> @test_cmp_d_512(<16 x i
 }
 
 define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_mask_cmp_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k2 {%k1}
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1}
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1}
-; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5 {%k1}
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_cmp_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
+; X86-NEXT:    vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0]
+; X86-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
+; X86-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05]
+; X86-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X86-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_cmp_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
+; X64-NEXT:    vpcmpgtd %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf1,0x75,0x49,0x66,0xd0]
+; X64-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xd9,0x02]
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
+; X64-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe9,0x05]
+; X64-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x66,0xc9]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X64-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
@@ -3256,29 +5102,30 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.
 define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_ucmp_d_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k5
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x48,0x76,0xc1]
+; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xd9,0x04]
+; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe1,0x05]
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x48,0x1e,0xe9,0x06]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf1,0x76,0xc9]
+; CHECK-NEXT:    vpblendw $128, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0x80]
+; CHECK-NEXT:    ## xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
@@ -3299,31 +5146,58 @@ define <8 x i16> @test_ucmp_d_512(<16 x
 }
 
 define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
-; CHECK-LABEL: test_mask_ucmp_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1}
-; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1}
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1}
-; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5 {%k1}
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_ucmp_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
+; X86-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01]
+; X86-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02]
+; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
+; X86-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05]
+; X86-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X86-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_ucmp_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x76,0xc1]
+; X64-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd1,0x01]
+; X64-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xd9,0x02]
+; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1f,0xe1,0x04]
+; X64-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xe9,0x05]
+; X64-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x1e,0xc9,0x06]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X64-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
   %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
@@ -3348,29 +5222,29 @@ declare i16 @llvm.x86.avx512.mask.ucmp.d
 define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_cmp_q_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k5
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    movl $255, %eax
-; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k1 ## encoding: [0x62,0xf2,0xf5,0x48,0x37,0xc8]
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd9,0x04]
+; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xe1,0x05]
+; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf2,0xfd,0x48,0x37,0xe9]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
@@ -3391,31 +5265,58 @@ define <8 x i8> @test_cmp_q_512(<8 x i64
 }
 
 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_cmp_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    vpcmpgtq %zmm0, %zmm1, %k2 {%k1}
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k3 {%k1}
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1}
-; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k5 {%k1}
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_cmp_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
+; X86-NEXT:    vpcmpgtq %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xd0]
+; X86-NEXT:    vpcmpleq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x04]
+; X86-NEXT:    vpcmpnltq %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe9,0x05]
+; X86-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X86-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_cmp_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
+; X64-NEXT:    vpcmpgtq %zmm0, %zmm1, %k2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x37,0xd0]
+; X64-NEXT:    vpcmpleq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xd9,0x02]
+; X64-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x04]
+; X64-NEXT:    vpcmpnltq %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe9,0x05]
+; X64-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x37,0xc9]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X64-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
@@ -3440,29 +5341,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.5
 define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_ucmp_q_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k5
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    movl $255, %eax
-; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x29,0xc1]
+; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1f,0xd9,0x04]
+; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe1,0x05]
+; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x48,0x1e,0xe9,0x06]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; CHECK-NEXT:    movl $255, %eax ## encoding: [0xb8,0xff,0x00,0x00,0x00]
+; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
@@ -3483,31 +5384,58 @@ define <8 x i8> @test_ucmp_q_512(<8 x i6
 }
 
 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-; CHECK-LABEL: test_mask_ucmp_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
-; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k3 {%k1}
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1}
-; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k5 {%k1}
-; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
-; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
-; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
-; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mask_ucmp_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
+; X86-NEXT:    vpcmpltuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x01]
+; X86-NEXT:    vpcmpleuq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd9,0x02]
+; X86-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x04]
+; X86-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xe9,0x05]
+; X86-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X86-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x00]
+; X86-NEXT:    kmovw %k2, %ecx ## encoding: [0xc5,0xf8,0x93,0xca]
+; X86-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x01]
+; X86-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; X86-NEXT:    vpinsrw $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x02]
+; X86-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; X86-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x04]
+; X86-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
+; X86-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x05]
+; X86-NEXT:    kmovw %k1, %ecx ## encoding: [0xc5,0xf8,0x93,0xc9]
+; X86-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc1,0x06]
+; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mask_ucmp_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x29,0xc1]
+; X64-NEXT:    vpcmpltuq %zmm1, %zmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd1,0x01]
+; X64-NEXT:    vpcmpleuq %zmm1, %zmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xd9,0x02]
+; X64-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1f,0xe1,0x04]
+; X64-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xe9,0x05]
+; X64-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1e,0xc9,0x06]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    vpxor %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xef,0xc0]
+; X64-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x00]
+; X64-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
+; X64-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
+; X64-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
+; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
+; X64-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
+; X64-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
+; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
   %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
@@ -3532,17 +5460,29 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.
 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovaps %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01]
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0xc8]
+; X86-NEXT:    vmovaps %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xd0]
+; X86-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X86-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01]
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x28,0xc8]
+; X64-NEXT:    vmovaps %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xc9,0x28,0xd0]
+; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1]
+; X64-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
@@ -3553,11 +5493,20 @@ define <16 x float>@test_int_x86_avx512_
 }
 
 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float>* %x0ptr, <16 x float> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vbroadcastf32x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x1a,0x00]
+; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x4_512_load:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vbroadcastf32x4 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x1a,0x07]
+; X64-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %x0 = load <4 x float>, <4 x float>* %x0ptr
   %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask)
   ret <16 x float> %res
@@ -3566,16 +5515,28 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm1
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01]
+; X86-NEXT:    vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9]
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01]
+; X86-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01]
+; X64-NEXT:    vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9]
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01]
+; X64-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -3586,11 +5547,21 @@ define <8 x double>@test_int_x86_avx512_
 }
 
 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vbroadcastf64x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x1b,0x00]
+; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512_load:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vbroadcastf64x4 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x1b,0x07]
+; X64-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %x0 = load <4 x double>, <4 x double>* %x0ptr
   %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask)
@@ -3600,17 +5571,29 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
-; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
+; X86-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01]
+; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc8]
+; X86-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd0]
+; X86-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xca]
+; X86-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    ## kill: def $xmm0 killed $xmm0 def $ymm0
+; X64-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01]
+; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xc0,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc8]
+; X64-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd0]
+; X64-NEXT:    vpaddd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xca]
+; X64-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -3621,11 +5604,20 @@ define <16 x i32>@test_int_x86_avx512_ma
 }
 
 define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vbroadcasti32x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x5a,0x00]
+; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x4_512_load:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vbroadcasti32x4 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x5a,0x07]
+; X64-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %x0 = load <4 x i32>, <4 x i32>* %x0ptr
   %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask)
@@ -3635,16 +5627,28 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01]
+; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01]
+; X86-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
+; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01]
+; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01]
+; X64-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
@@ -3655,11 +5659,21 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
-; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vbroadcasti64x4 (%eax), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x5b,0x00]
+; X86-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512_load:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vbroadcasti64x4 (%rdi), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x5b,0x07]
+; X64-NEXT:    ## zmm0 {%k1} = mem[0,1,2,3,0,1,2,3]
+; X64-NEXT:    retq ## encoding: [0xc3]
 
   %x0 = load <4 x i64>, <4 x i64>* %x0ptr
   %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask)
@@ -3669,13 +5683,21 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpabsd %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpabsd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pabs_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpabsd %zmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xd0]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpabsd %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x1e,0xc8]
+; X86-NEXT:    vpaddd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pabs_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpabsd %zmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpabsd %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x1e,0xc8]
+; X64-NEXT:    vpaddd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -3685,13 +5707,22 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pabs_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpabsq %zmm0, %zmm2
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpabsq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pabs_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpabsq %zmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xd0]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpabsq %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x1f,0xc8]
+; X86-NEXT:    vpaddq %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pabs_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpabsq %zmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xd0]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpabsq %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x1f,0xc8]
+; X64-NEXT:    vpaddq %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -3699,15 +5730,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 }
 
 define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
-; CHECK-LABEL: test_vptestmq:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andb %al, %dil
-; CHECK-NEXT:    addb %dil, %al
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_vptestmq:
+; X86:       ## %bb.0:
+; X86-NEXT:    vptestmq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    andb %cl, %al ## encoding: [0x20,0xc8]
+; X86-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vptestmq:
+; X64:       ## %bb.0:
+; X64-NEXT:    vptestmq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; X64-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
   %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
   %res2 = add i8 %res1, %res
@@ -3716,15 +5757,26 @@ define i8 @test_vptestmq(<8 x i64> %a0,
 declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
 
 define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
-; CHECK-LABEL: test_vptestmd:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %eax, %edi
-; CHECK-NEXT:    addl %edi, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_vptestmd:
+; X86:       ## %bb.0:
+; X86-NEXT:    vptestmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    andw %cx, %ax ## encoding: [0x66,0x21,0xc8]
+; X86-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_vptestmd:
+; X64:       ## %bb.0:
+; X64-NEXT:    vptestmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; X64-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
   %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
   %res2 = add i16 %res1, %res
@@ -3735,15 +5787,26 @@ declare i16 @llvm.x86.avx512.ptestm.d.51
 declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2)
 
 define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl %eax, %edi
-; CHECK-NEXT:    addl %edi, %eax
-; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_ptestnm_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vptestnmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    andw %cx, %ax ## encoding: [0x66,0x21,0xc8]
+; X86-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestnm_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vptestnmd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x48,0x27,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andl %eax, %edi ## encoding: [0x21,0xc7]
+; X64-NEXT:    addl %edi, %eax ## encoding: [0x01,0xf8]
+; X64-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res2 = add i16 %res, %res1
@@ -3753,15 +5816,25 @@ define i16@test_int_x86_avx512_ptestnm_d
 declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2)
 
 define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andb %al, %dil
-; CHECK-NEXT:    addb %dil, %al
-; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_ptestnm_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vptestnmq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x48,0x27,0xc1]
+; X86-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al ## encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    andb %cl, %al ## encoding: [0x20,0xc8]
+; X86-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; X86-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestnm_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vptestnmq %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x48,0x27,0xc1]
+; X64-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; X64-NEXT:    andb %al, %dil ## encoding: [0x40,0x20,0xc7]
+; X64-NEXT:    addb %dil, %al ## encoding: [0x40,0x00,0xf8]
+; X64-NEXT:    ## kill: def $al killed $al killed $eax
+; X64-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1)
   %res2 = add i8 %res, %res1
@@ -3770,12 +5843,20 @@ define i8@test_int_x86_avx512_ptestnm_q_
 
 declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
 define i16 @test_kand(i16 %a0, i16 %a1) {
-; CHECK-LABEL: test_kand:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    andl $8, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_kand:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08]
+; X86-NEXT:    andl $8, %eax ## encoding: [0x83,0xe0,0x08]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kand:
+; X64:       ## %bb.0:
+; X64-NEXT:    andl %esi, %edi ## encoding: [0x21,0xf7]
+; X64-NEXT:    andl $8, %edi ## encoding: [0x83,0xe7,0x08]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
   ret i16 %t2
@@ -3783,12 +5864,21 @@ define i16 @test_kand(i16 %a0, i16 %a1)
 
 declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone
 define i16 @test_kandn(i16 %a0, i16 %a1) {
-; CHECK-LABEL: test_kandn:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    orl $-9, %edi
-; CHECK-NEXT:    andl %esi, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_kandn:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl $65527, %eax ## encoding: [0xb8,0xf7,0xff,0x00,0x00]
+; X86-NEXT:    ## imm = 0xFFF7
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax ## encoding: [0x0b,0x44,0x24,0x04]
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x23,0x44,0x24,0x08]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kandn:
+; X64:       ## %bb.0:
+; X64-NEXT:    orl $-9, %edi ## encoding: [0x83,0xcf,0xf7]
+; X64-NEXT:    andl %esi, %edi ## encoding: [0x21,0xf7]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1)
   ret i16 %t2
@@ -3796,23 +5886,38 @@ define i16 @test_kandn(i16 %a0, i16 %a1)
 
 declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
 define i16 @test_knot(i16 %a0) {
-; CHECK-LABEL: test_knot:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    notl %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_knot:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    notl %eax ## encoding: [0xf7,0xd0]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_knot:
+; X64:       ## %bb.0:
+; X64-NEXT:    notl %edi ## encoding: [0xf7,0xd7]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
   ret i16 %res
 }
 
 declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone
 define i16 @test_kor(i16 %a0, i16 %a1) {
-; CHECK-LABEL: test_kor:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    orl $8, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_kor:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    orw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x0b,0x44,0x24,0x08]
+; X86-NEXT:    orl $8, %eax ## encoding: [0x83,0xc8,0x08]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kor:
+; X64:       ## %bb.0:
+; X64-NEXT:    orl %esi, %edi ## encoding: [0x09,0xf7]
+; X64-NEXT:    orl $8, %edi ## encoding: [0x83,0xcf,0x08]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1)
   ret i16 %t2
@@ -3822,12 +5927,20 @@ declare i16 @llvm.x86.avx512.kxnor.w(i16
 ; TODO: the two kxnor instructions here are a no-op and should be eliminated,
 ; probably by FoldConstantArithmetic in SelectionDAG.
 define i16 @test_kxnor(i16 %a0, i16 %a1) {
-; CHECK-LABEL: test_kxnor:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl $8, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_kxnor:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08]
+; X86-NEXT:    xorl $8, %eax ## encoding: [0x83,0xf0,0x08]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kxnor:
+; X64:       ## %bb.0:
+; X64-NEXT:    xorl %esi, %edi ## encoding: [0x31,0xf7]
+; X64-NEXT:    xorl $8, %edi ## encoding: [0x83,0xf7,0x08]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
   ret i16 %t2
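
The TODO above expects the back-to-back kxnors to constant-fold. As a minimal sketch (not part of this patch; the folded function name is hypothetical), assuming kxnor.w(a, b) == ~(a ^ b), the two inversions cancel and the pair reduces to plain xors, which matches the xorl/xorl sequence in the checks:

define i16 @test_kxnor_folded(i16 %a0, i16 %a1) {
; kxnor(kxnor(%a0, 8), %a1) == ~(~(%a0 ^ 8) ^ %a1) == (%a0 ^ 8) ^ %a1
  %x1 = xor i16 %a0, 8
  %x2 = xor i16 %x1, %a1
  ret i16 %x2
}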
@@ -3835,12 +5948,20 @@ define i16 @test_kxnor(i16 %a0, i16 %a1)
 
 declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
 define i16 @test_kxor(i16 %a0, i16 %a1) {
-; CHECK-LABEL: test_kxor:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl $8, %edi
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    retq
+; X86-LABEL: test_kxor:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    xorw {{[0-9]+}}(%esp), %ax ## encoding: [0x66,0x33,0x44,0x24,0x08]
+; X86-NEXT:    xorl $8, %eax ## encoding: [0x83,0xf0,0x08]
+; X86-NEXT:    ## kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_kxor:
+; X64:       ## %bb.0:
+; X64-NEXT:    xorl %esi, %edi ## encoding: [0x31,0xf7]
+; X64-NEXT:    xorl $8, %edi ## encoding: [0x83,0xf7,0x08]
+; X64-NEXT:    movl %edi, %eax ## encoding: [0x89,0xf8]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
   ret i16 %t2
@@ -3850,13 +5971,13 @@ declare i32 @llvm.x86.avx512.kortestz.w(
 define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) {
 ; CHECK-LABEL: test_kortestz:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    kortestw %k1, %k0
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
+; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
   %1 = bitcast <8 x i64> %B to <16 x i32>
@@ -3874,13 +5995,13 @@ declare i32 @llvm.x86.avx512.kortestc.w(
 define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) {
 ; CHECK-LABEL: test_kortestc:
 ; CHECK:       ## %bb.0: ## %entry
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    kortestw %k1, %k0
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1f,0xc1,0x04]
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1 ## encoding: [0x62,0xf3,0x6d,0x48,0x1f,0xcb,0x04]
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    kortestw %k1, %k0 ## encoding: [0xc5,0xf8,0x98,0xc1]
+; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
   %1 = bitcast <8 x i64> %B to <16 x i32>
@@ -3897,11 +6018,11 @@ entry:
 define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: test_cmpps:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
   ret i16 %res
 }
@@ -3910,11 +6031,11 @@ declare i16 @llvm.x86.avx512.mask.cmp.ps
 define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
 ; CHECK-LABEL: test_cmppd:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc1,0x04]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
   ret i8 %res
 }
@@ -3923,19 +6044,27 @@ declare i8 @llvm.x86.avx512.mask.cmp.pd.
 define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mul_epi32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
   %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
@@ -3943,11 +6072,18 @@ define <8 x i64> @test_mul_epi32_rrk(<16
 }
 
 define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
   %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
@@ -3955,22 +6091,37 @@ define <8 x i64> @test_mul_epi32_rrkz(<1
 }
 
 define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mul_epi32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -3979,11 +6130,19 @@ define <8 x i64> @test_mul_epi32_rmk(<16
 }
 
 define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuldq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -3992,10 +6151,19 @@ define <8 x i64> @test_mul_epi32_rmkz(<1
 }
 
 define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mul_epi32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x58,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4005,12 +6173,24 @@ define <8 x i64> @test_mul_epi32_rmb(<16
 }
 
 define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x28,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x28,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4022,11 +6202,22 @@ define <8 x i64> @test_mul_epi32_rmbk(<1
 }
 
 define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mul_epi32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epi32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuldq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x28,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epi32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuldq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x28,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4042,19 +6233,27 @@ declare <8 x i64> @llvm.x86.avx512.pmul.
 define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: test_mul_epu32_rr:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rrk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rrk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rrk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
   %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
@@ -4062,11 +6261,18 @@ define <8 x i64> @test_mul_epu32_rrk(<16
 }
 
 define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rrkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rrkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rrkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
   %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
@@ -4074,22 +6280,37 @@ define <8 x i64> @test_mul_epu32_rrkz(<1
 }
 
 define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; CHECK-LABEL: test_mul_epu32_rm:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rm:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rm:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   ret < 8 x i64> %res
 }
 
 define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rmk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rmk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rmk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -4098,11 +6319,19 @@ define <8 x i64> @test_mul_epu32_rmk(<16
 }
 
 define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, <16 x i32>* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rmkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rmkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vpmuludq (%eax), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rmkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -4111,10 +6340,19 @@ define <8 x i64> @test_mul_epu32_rmkz(<1
 }
 
 define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, i64* %ptr_b) {
-; CHECK-LABEL: test_mul_epu32_rmb:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rmb:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rmb:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4124,12 +6362,24 @@ define <8 x i64> @test_mul_epu32_rmb(<16
 }
 
 define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rmbk:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rmbk:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x10]
+; X86-NEXT:    ## xmm2 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd2]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xf4,0xca]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rmbk:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xf4,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4141,11 +6391,22 @@ define <8 x i64> @test_mul_epu32_rmbk(<1
 }
 
 define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, i64* %ptr_b, i8 %mask) {
-; CHECK-LABEL: test_mul_epu32_rmbkz:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; X86-LABEL: test_mul_epu32_rmbkz:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vmovq (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x08]
+; X86-NEXT:    ## xmm1 = mem[0],zero
+; X86-NEXT:    vpbroadcastq %xmm1, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xc9]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xf4,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_mul_epu32_rmbkz:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vpmuludq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xf4,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
   %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
   %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -4159,10 +6420,15 @@ define <8 x i64> @test_mul_epu32_rmbkz(<
 declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>)
 
 define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b)
-; CHECK-LABEL: test_x86_avx512_mm_cvtu32_sd:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_avx512_mm_cvtu32_sd:
+; X86:       ## %bb.0:
+; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7b,0x44,0x24,0x01]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mm_cvtu32_sd:
+; X64:       ## %bb.0:
+; X64-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7f,0x08,0x7b,0xc7]
+; X64-NEXT:    retq ## encoding: [0xc3]
 {
   %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
@@ -4170,20 +6436,32 @@ define <2 x double> @test_x86_avx512_mm_
 declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone
 
 define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
-; CHECK-LABEL: test_x86_vbroadcast_ss_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_vbroadcast_ss_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vbroadcastss (%eax), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vbroadcast_ss_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vbroadcastss (%rdi), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8* %a0) ; <<16 x float>> [#uses=1]
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(i8*) nounwind readonly
 
 define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
-; CHECK-LABEL: test_x86_vbroadcast_sd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_x86_vbroadcast_sd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vbroadcastsd (%eax), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x00]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_x86_vbroadcast_sd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vbroadcastsd (%rdi), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x07]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8* %a0) ; <<8 x double>> [#uses=1]
   ret <8 x double> %res
 }
@@ -4192,15 +6470,26 @@ declare <8 x double> @llvm.x86.avx512.vb
 declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0]
+; X86-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0]
+; X86-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X86-NEXT:    vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0]
+; X64-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0]
+; X64-NEXT:    vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0]
+; X64-NEXT:    vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
   %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -4212,15 +6501,26 @@ define <8 x double>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0]
+; X86-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0]
+; X86-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X86-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0]
+; X64-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0]
+; X64-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X64-NEXT:    vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
   %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -4232,15 +6532,25 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0]
+; X86-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0]
+; X86-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X86-NEXT:    vaddps %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0]
+; X64-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0]
+; X64-NEXT:    vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0]
+; X64-NEXT:    vaddps %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
   %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -4252,15 +6562,25 @@ define <16 x float>@test_int_x86_avx512_
 declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0]
+; X86-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0]
+; X86-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X86-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0]
+; X64-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0]
+; X64-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X64-NEXT:    vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
   %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -4272,14 +6592,23 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21]
+; X86-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21]
+; X64-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -4289,14 +6618,23 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
 
 define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21]
+; X86-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21]
+; X64-NEXT:    vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -4306,14 +6644,24 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21]
+; X86-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21]
+; X64-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -4323,14 +6671,24 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
 
 define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21]
+; X86-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_pternlog_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xc9,0x25,0xc2,0x21]
+; X64-NEXT:    vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -4340,14 +6698,24 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
-; CHECK-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermi2d (%eax), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x18]
+; X86-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2]
+; X86-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermi2d (%rdi), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x1f]
+; X64-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2]
+; X64-NEXT:    vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
@@ -4358,14 +6726,24 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovapd %zmm0, %zmm3
-; CHECK-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddpd %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8]
+; X86-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca]
+; X86-NEXT:    vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8]
+; X64-NEXT:    vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca]
+; X64-NEXT:    vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
   %res2 = fadd <8 x double> %res, %res1
@@ -4375,14 +6753,23 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps %zmm0, %zmm3
-; CHECK-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
+; X86-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca]
+; X86-NEXT:    vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8]
+; X64-NEXT:    vpermt2ps %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca]
+; X64-NEXT:    vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -4392,14 +6779,24 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca]
+; X86-NEXT:    vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca]
+; X64-NEXT:    vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -4409,14 +6806,24 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm2
-; CHECK-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1]
+; X86-NEXT:    vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x10]
+; X86-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9]
+; X86-NEXT:    vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1]
+; X64-NEXT:    vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x17]
+; X64-NEXT:    vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9]
+; X64-NEXT:    vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %x2 = load <16 x i32>, <16 x i32>* %x2p
   %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
@@ -4427,14 +6834,25 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
 
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovapd %zmm1, %zmm2
-; CHECK-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1
-; CHECK-NEXT:    vaddpd %zmm1, %zmm2, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT:    kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9]
+; X86-NEXT:    vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1]
+; X86-NEXT:    vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x10]
+; X86-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9]
+; X86-NEXT:    vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; X64-NEXT:    vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1]
+; X64-NEXT:    vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x17]
+; X64-NEXT:    vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9]
+; X64-NEXT:    vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %x2s = load double, double* %x2ptr
   %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
   %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
@@ -4447,14 +6865,23 @@ define <8 x double>@test_int_x86_avx512_
 declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovaps %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9]
+; X86-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca]
+; X86-NEXT:    vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9]
+; X64-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca]
+; X64-NEXT:    vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
   %res2 = fadd <16 x float> %res, %res1
@@ -4465,14 +6892,24 @@ define <16 x float>@test_int_x86_avx512_
 declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda]
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT:    kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca]
+; X86-NEXT:    vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca]
+; X64-NEXT:    vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   %res2 = add <8 x i64> %res, %res1
@@ -4482,14 +6919,23 @@ define <8 x i64>@test_int_x86_avx512_mas
 declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
-; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm1, %zmm0
-; CHECK-NEXT:    retq
+; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
+; X86:       ## %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca]
+; X86-NEXT:    vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3]
+; X86-NEXT:    retl ## encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
+; X64:       ## %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda]
+; X64-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca]
+; X64-NEXT:    vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3]
+; X64-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
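
A note on the pattern repeated throughout the hunks above: under the new i686 RUN line the i8/i16 mask argument arrives on the stack, so the X86 checks load it via a movzbl/kmovw pair (or kmovw straight from %esp for i16 masks), while the X64 checks keep the register form. A minimal sketch of a test written in this style, assuming the same RUN/prefix structure as this file; the function and its CHECK lines are illustrative hand-written approximations, not autogenerated output:

; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

define <16 x i32> @sketch_maskz_add(<16 x i32> %a, <16 x i32> %b, i16 %m) {
; On i686 the i16 mask is a stack argument; on x86_64 it is in %edi.
; X86: kmovw {{[0-9]+}}(%esp), %k1
; X64: kmovw %edi, %k1
; CHECK: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
; CHECK: ret{{[l|q]}}
  %sum = add <16 x i32> %a, %b
  %bits = bitcast i16 %m to <16 x i1>
  %res = select <16 x i1> %bits, <16 x i32> %sum, <16 x i32> zeroinitializer
  ret <16 x i32> %res
}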

Modified: llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vpclmulqdq.ll Sun Jun  3 07:56:04 2018
@@ -1,10 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefix=AVX512_VPCLMULQDQ
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+vpclmulqdq -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 define <8 x i64> @test_x86_pclmulqdq(<8 x i64> %a0, <8 x i64> %a1) {
-; AVX512_VPCLMULQDQ-LABEL: test_x86_pclmulqdq:
-; AVX512_VPCLMULQDQ:       # %bb.0:
-; AVX512_VPCLMULQDQ-NEXT:    vpclmulqdq $1, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x44,0xc1,0x01]
-; AVX512_VPCLMULQDQ-NEXT:    retq # encoding: [0xc3]
+; CHECK-LABEL: test_x86_pclmulqdq:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpclmulqdq $1, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x44,0xc1,0x01]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.pclmulqdq.512(<8 x i64> %a0, <8 x i64> %a1, i8 1)
   ret <8 x i64> %res
 }
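
When the two triples produce byte-identical code, as for the EVEX-encoded vpclmulqdq above (a pure register-to-register instruction, so no stack or RIP-relative operand differs between modes), update_llc_test_checks.py folds both runs into the shared CHECK prefix and only the return mnemonic varies, matched by the ret{{[l|q]}} regex. A minimal sketch of that fully-merged case under the same RUN lines as above; the function is hypothetical:

define <8 x i64> @sketch_merged_checks(<8 x i64> %a, <8 x i64> %b) {
; CHECK-LABEL: sketch_merged_checks:
; CHECK:       vpaddq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:  ret{{[l|q]}}
  %r = add <8 x i64> %a, %b
  ret <8 x i64> %r
}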

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-fast-isel.ll Sun Jun  3 07:56:04 2018
@@ -1,33 +1,33 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=CHECK,X64
 
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
 
 define i64 @test_mm512_kunpackd(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackd:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
-; X32-NEXT:    andl $-64, %esp
-; X32-NEXT:    subl $64, %esp
-; X32-NEXT:    vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
-; X32-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
-; X32-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
-; X32-NEXT:    kandd %k0, %k2, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    kshiftrq $32, %k2, %k0
-; X32-NEXT:    kandd %k1, %k0, %k0
-; X32-NEXT:    kmovd %k0, %edx
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_kunpackd:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-64, %esp
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
+; X86-NEXT:    vpcmpneqb %zmm0, %zmm1, %k0
+; X86-NEXT:    vpcmpneqb 8(%ebp), %zmm2, %k1
+; X86-NEXT:    vpcmpneqb 72(%ebp), %zmm3, %k2
+; X86-NEXT:    kandd %k0, %k2, %k0
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    kshiftrq $32, %k2, %k0
+; X86-NEXT:    kandd %k1, %k0, %k0
+; X86-NEXT:    kmovd %k0, %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_kunpackd:
 ; X64:       # %bb.0: # %entry
@@ -57,26 +57,26 @@ entry:
 }
 
 define i32 @test_mm512_kunpackw(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) {
-; X32-LABEL: test_mm512_kunpackw:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    .cfi_def_cfa_offset 8
-; X32-NEXT:    .cfi_offset %ebp, -8
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    .cfi_def_cfa_register %ebp
-; X32-NEXT:    andl $-64, %esp
-; X32-NEXT:    subl $64, %esp
-; X32-NEXT:    vmovdqa64 136(%ebp), %zmm3
-; X32-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
-; X32-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
-; X32-NEXT:    kunpckwd %k0, %k1, %k1
-; X32-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
-; X32-NEXT:    .cfi_def_cfa %esp, 4
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_kunpackw:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-64, %esp
+; X86-NEXT:    subl $64, %esp
+; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
+; X86-NEXT:    vpcmpneqw %zmm0, %zmm1, %k0
+; X86-NEXT:    vpcmpneqw 8(%ebp), %zmm2, %k1
+; X86-NEXT:    kunpckwd %k0, %k1, %k1
+; X86-NEXT:    vpcmpneqw 72(%ebp), %zmm3, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_kunpackw:
 ; X64:       # %bb.0: # %entry
@@ -107,14 +107,14 @@ entry:
 
 
 define <8 x i64> @test_mm512_mask_set1_epi8(<8 x i64> %__O, i64 %__M, i8 signext %__A)  {
-; X32-LABEL: test_mm512_mask_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kunpckdq %k1, %k0, %k1
-; X32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kunpckdq %k1, %k0, %k1
+; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -132,14 +132,14 @@ define <8 x i64> @test_mm512_mask_set1_e
 }
 
 define <8 x i64> @test_mm512_maskz_set1_epi8(i64 %__M, i8 signext %__A)  {
-; X32-LABEL: test_mm512_maskz_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kunpckdq %k1, %k0, %k1
-; X32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kunpckdq %k1, %k0, %k1
+; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -156,12 +156,12 @@ define <8 x i64> @test_mm512_maskz_set1_
 }
 
 define <8 x i64> @test_mm512_mask_set1_epi16(<8 x i64> %__O, i32 %__M, i16 signext %__A)  {
-; X32-LABEL: test_mm512_mask_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -179,12 +179,12 @@ define <8 x i64> @test_mm512_mask_set1_e
 }
 
 define <8 x i64> @test_mm512_maskz_set1_epi16(i32 %__M, i16 signext %__A)  {
-; X32-LABEL: test_mm512_maskz_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -201,15 +201,10 @@ define <8 x i64> @test_mm512_maskz_set1_
 }
 
 define <8 x i64> @test_mm512_broadcastb_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm512_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastb %xmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_broadcastb_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_broadcastb_epi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <64 x i32> zeroinitializer
   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
@@ -217,12 +212,12 @@ define <8 x i64> @test_mm512_broadcastb_
 }
 
 define <8 x i64> @test_mm512_mask_broadcastb_epi8(<8 x i64> %a0, i64* %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm512_mask_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpbroadcastb %xmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -240,12 +235,12 @@ define <8 x i64> @test_mm512_mask_broadc
 }
 
 define <8 x i64> @test_mm512_maskz_broadcastb_epi8(i64* %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm512_maskz_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -262,15 +257,10 @@ define <8 x i64> @test_mm512_maskz_broad
 }
 
 define <8 x i64> @test_mm512_broadcastw_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm512_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastw %xmm0, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_broadcastw_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_broadcastw_epi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <32 x i32> zeroinitializer
   %res1 = bitcast <32 x i16> %res0 to <8 x i64>
@@ -278,11 +268,11 @@ define <8 x i64> @test_mm512_broadcastw_
 }
 
 define <8 x i64> @test_mm512_mask_broadcastw_epi16(<8 x i64> %a0, i32 %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm512_mask_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %xmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -299,11 +289,11 @@ define <8 x i64> @test_mm512_mask_broadc
 }
 
 define <8 x i64> @test_mm512_maskz_broadcastw_epi16(i32 %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm512_maskz_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -319,15 +309,10 @@ define <8 x i64> @test_mm512_maskz_broad
 }
 
 define <8 x i64> @test_mm512_bslli_epi128(<8 x i64> %a0) {
-; X32-LABEL: test_mm512_bslli_epi128:
-; X32:       # %bb.0:
-; X32-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_bslli_epi128:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_bslli_epi128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
@@ -335,15 +320,10 @@ define <8 x i64> @test_mm512_bslli_epi12
 }
 
 define <8 x i64> @test_mm512_bsrli_epi128(<8 x i64> %a0) {
-; X32-LABEL: test_mm512_bsrli_epi128:
-; X32:       # %bb.0:
-; X32-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_bsrli_epi128:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_bsrli_epi128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zmm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zmm0[37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zmm0[53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> zeroinitializer, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
   %res1 = bitcast <64 x i8> %res0 to <8 x i64>
@@ -351,15 +331,10 @@ define <8 x i64> @test_mm512_bsrli_epi12
 }
 
 define <8 x i64> @test_mm512_unpackhi_epi8(<8 x i64> %a0, <8 x i64> %a1) {
-; X32-LABEL: test_mm512_unpackhi_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_unpackhi_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_unpackhi_epi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpckhbw {{.*#+}} zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
   %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
@@ -369,12 +344,12 @@ define <8 x i64> @test_mm512_unpackhi_ep
 
 ; TODO - improve support for i64 -> mmask64 on 32-bit targets
 define <8 x i64> @test_mm512_mask_unpackhi_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
-; X32-LABEL: test_mm512_mask_unpackhi_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_unpackhi_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_unpackhi_epi8:
 ; X64:       # %bb.0:
@@ -393,12 +368,12 @@ define <8 x i64> @test_mm512_mask_unpack
 }
 
 define <8 x i64> @test_mm512_maskz_unpackhi_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; X32-LABEL: test_mm512_maskz_unpackhi_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_unpackhi_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpunpckhbw {{.*#+}} zmm0 {%k1} {z} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_unpackhi_epi8:
 ; X64:       # %bb.0:
@@ -416,15 +391,10 @@ define <8 x i64> @test_mm512_maskz_unpac
 }
 
 define <8 x i64> @test_mm512_unpackhi_epi16(<8 x i64> %a0, <8 x i64> %a1) {
-; X32-LABEL: test_mm512_unpackhi_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_unpackhi_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_unpackhi_epi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
   %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
   %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
@@ -433,11 +403,11 @@ define <8 x i64> @test_mm512_unpackhi_ep
 }
 
 define <8 x i64> @test_mm512_mask_unpackhi_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
-; X32-LABEL: test_mm512_mask_unpackhi_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_unpackhi_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} = zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_unpackhi_epi16:
 ; X64:       # %bb.0:
@@ -455,11 +425,11 @@ define <8 x i64> @test_mm512_mask_unpack
 }
 
 define <8 x i64> @test_mm512_maskz_unpackhi_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; X32-LABEL: test_mm512_maskz_unpackhi_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_unpackhi_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpunpckhwd {{.*#+}} zmm0 {%k1} {z} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_unpackhi_epi16:
 ; X64:       # %bb.0:
@@ -476,15 +446,10 @@ define <8 x i64> @test_mm512_maskz_unpac
 }
 
 define <8 x i64> @test_mm512_unpacklo_epi8(<8 x i64> %a0, <8 x i64> %a1) {
-; X32-LABEL: test_mm512_unpacklo_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_unpacklo_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_unpacklo_epi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <64 x i8>
   %arg1 = bitcast <8 x i64> %a1 to <64 x i8>
   %res0 = shufflevector <64 x i8> %arg0, <64 x i8> %arg1, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
@@ -493,12 +458,12 @@ define <8 x i64> @test_mm512_unpacklo_ep
 }
 
 define <8 x i64> @test_mm512_mask_unpacklo_epi8(<8 x i64> %a0, i64* %a1, <8 x i64> %a2, <8 x i64> %a3) {
-; X32-LABEL: test_mm512_mask_unpacklo_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_unpacklo_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_unpacklo_epi8:
 ; X64:       # %bb.0:
@@ -517,12 +482,12 @@ define <8 x i64> @test_mm512_mask_unpack
 }
 
 define <8 x i64> @test_mm512_maskz_unpacklo_epi8(i64* %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; X32-LABEL: test_mm512_maskz_unpacklo_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovq (%eax), %k1
-; X32-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_unpacklo_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovq (%eax), %k1
+; X86-NEXT:    vpunpcklbw {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_unpacklo_epi8:
 ; X64:       # %bb.0:
@@ -540,15 +505,10 @@ define <8 x i64> @test_mm512_maskz_unpac
 }
 
 define <8 x i64> @test_mm512_unpacklo_epi16(<8 x i64> %a0, <8 x i64> %a1) {
-; X32-LABEL: test_mm512_unpacklo_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_unpacklo_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_unpacklo_epi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <8 x i64> %a0 to <32 x i16>
   %arg1 = bitcast <8 x i64> %a1 to <32 x i16>
   %res0 = shufflevector <32 x i16> %arg0, <32 x i16> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
@@ -557,11 +517,11 @@ define <8 x i64> @test_mm512_unpacklo_ep
 }
 
 define <8 x i64> @test_mm512_mask_unpacklo_epi16(<8 x i64> %a0, i32 %a1, <8 x i64> %a2, <8 x i64> %a3) {
-; X32-LABEL: test_mm512_mask_unpacklo_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_unpacklo_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_unpacklo_epi16:
 ; X64:       # %bb.0:
@@ -579,11 +539,11 @@ define <8 x i64> @test_mm512_mask_unpack
 }
 
 define <8 x i64> @test_mm512_maskz_unpacklo_epi16(i32 %a0, <8 x i64> %a1, <8 x i64> %a2) {
-; X32-LABEL: test_mm512_maskz_unpacklo_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_unpacklo_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpunpcklwd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_unpacklo_epi16:
 ; X64:       # %bb.0:
@@ -600,14 +560,14 @@ define <8 x i64> @test_mm512_maskz_unpac
 }
 
 define i64 @test_mm512_test_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmb %zmm0, %zmm1, %k0
-; X32-NEXT:    kshiftrq $32, %k0, %k1
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    kmovd %k1, %edx
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_test_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    kmovd %k1, %edx
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_test_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -624,16 +584,16 @@ entry:
 }
 
 define i64 @test_mm512_mask_test_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmb %zmm0, %zmm1, %k0
-; X32-NEXT:    kshiftrq $32, %k0, %k1
-; X32-NEXT:    kmovd %k1, %edx
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_test_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestmb %zmm0, %zmm1, %k0
+; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k1, %edx
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_test_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -653,19 +613,12 @@ entry:
 }
 
 define i32 @test_mm512_test_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmw %zmm0, %zmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_test_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestmw %zmm0, %zmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_test_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestmw %zmm0, %zmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and1.i.i = and <8 x i64> %__B, %__A
   %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
@@ -675,13 +628,13 @@ entry:
 }
 
 define i32 @test_mm512_mask_test_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_test_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestmw %zmm0, %zmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_test_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -701,14 +654,14 @@ entry:
 }
 
 define i64 @test_mm512_testn_epi8_mask(<8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmb %zmm0, %zmm1, %k0
-; X32-NEXT:    kshiftrq $32, %k0, %k1
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    kmovd %k1, %edx
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_testn_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    kmovd %k1, %edx
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_testn_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -725,16 +678,16 @@ entry:
 }
 
 define i64 @test_mm512_mask_testn_epi8_mask(i64 %__U, <8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmb %zmm0, %zmm1, %k0
-; X32-NEXT:    kshiftrq $32, %k0, %k1
-; X32-NEXT:    kmovd %k1, %edx
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_testn_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    vptestnmb %zmm0, %zmm1, %k0
+; X86-NEXT:    kshiftrq $32, %k0, %k1
+; X86-NEXT:    kmovd %k1, %edx
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_testn_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -754,19 +707,12 @@ entry:
 }
 
 define i32 @test_mm512_testn_epi16_mask(<8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmw %zmm0, %zmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_testn_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestnmw %zmm0, %zmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_testn_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestnmw %zmm0, %zmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and1.i.i = and <8 x i64> %__B, %__A
   %0 = bitcast <8 x i64> %and1.i.i to <32 x i16>
@@ -776,13 +722,13 @@ entry:
 }
 
 define i32 @test_mm512_mask_testn_epi16_mask(i32 %__U, <8 x i64> %__A, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_testn_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestnmw %zmm0, %zmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_testn_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -802,15 +748,10 @@ entry:
 }
 
 define <4 x i64> @test_mm512_cvtepi16_epi8(<8 x i64> %__A) {
-; X32-LABEL: test_mm512_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vpmovwb %zmm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_cvtepi16_epi8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vpmovwb %zmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_cvtepi16_epi8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmovwb %zmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %conv.i = trunc <32 x i16> %0 to <32 x i8>
@@ -819,11 +760,11 @@ entry:
 }
 
 define <4 x i64> @test_mm512_mask_cvtepi16_epi8(<4 x i64> %__O, i32 %__M, <8 x i64> %__A) {
-; X32-LABEL: test_mm512_mask_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_cvtepi16_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmovwb %zmm1, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_cvtepi16_epi8:
 ; X64:       # %bb.0: # %entry
@@ -841,11 +782,11 @@ entry:
 }
 
 define <4 x i64> @test_mm512_maskz_cvtepi16_epi8(i32 %__M, <8 x i64> %__A) {
-; X32-LABEL: test_mm512_maskz_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_cvtepi16_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmovwb %zmm0, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_cvtepi16_epi8:
 ; X64:       # %bb.0: # %entry
@@ -862,12 +803,12 @@ entry:
 }
 
 define <8 x i64> @test_mm512_mask2_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, i32 %__U, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask2_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
-; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask2_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask2_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -887,15 +828,10 @@ entry:
 }
 
 define <8 x i64> @test_mm512_permutex2var_epi16(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm512_permutex2var_epi16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm512_permutex2var_epi16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <8 x i64> %__A to <32 x i16>
   %1 = bitcast <8 x i64> %__I to <32 x i16>
@@ -906,11 +842,11 @@ entry:
 }
 
 define <8 x i64> @test_mm512_mask_permutex2var_epi16(<8 x i64> %__A, i32 %__U, <8 x i64> %__I, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_mask_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_mask_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_mask_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -929,11 +865,11 @@ entry:
 }
 
 define <8 x i64> @test_mm512_maskz_permutex2var_epi16(i32 %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) {
-; X32-LABEL: test_mm512_maskz_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm512_maskz_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm512_maskz_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Sun Jun  3 07:56:04 2018
@@ -1,25 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
 
 declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
 
 define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    kunpckwd %k1, %k0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckwd %k1, %k0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_kunpck_wd:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 # encoding: [0xc5,0xf8,0x90,0x44,0x24,0x04]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    kunpckwd %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_kunpck_wd:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k0 # encoding: [0xc5,0xfb,0x92,0xc7]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    kunpckwd %k1, %k0, %k0 # encoding: [0xc5,0xfc,0x4b,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1)
   ret i32 %res
 }
@@ -27,19 +27,19 @@ define i32@test_int_x86_avx512_kunpck_wd
 declare i64 @llvm.x86.avx512.kunpck.dq(i64, i64)
 
 define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k0
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    kunpckdq %k1, %k0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_qd:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_kunpck_qd:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x0c]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_kunpck_qd:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k0 # encoding: [0xc4,0xe1,0xfb,0x92,0xc7]
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    kunpckdq %k1, %k0, %k0 # encoding: [0xc4,0xe1,0xfc,0x4b,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.kunpck.dq(i64 %x0, i64 %x1)
   ret i64 %res
 }
@@ -47,26 +47,26 @@ define i64@test_int_x86_avx512_kunpck_qd
 declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
 
   define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm2 {%k1} {z}
-; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm2 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al # encoding: [0x8a,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastb %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc8]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpbroadcastb %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc0]
+; X86-NEXT:    vpbroadcastb %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd0]
+; X86-NEXT:    vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2]
+; X86-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastb %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xcf]
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpbroadcastb %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc7]
+; X64-NEXT:    vpbroadcastb %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd7]
+; X64-NEXT:    vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2]
+; X64-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
     %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask)
     %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask)
@@ -77,26 +77,26 @@ declare <64 x i8> @llvm.x86.avx512.mask.
 
 declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32)
   define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm2 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm2 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb7,0x44,0x24,0x04]
+; X86-NEXT:    vpbroadcastw %eax, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc8]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpbroadcastw %eax, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc0]
+; X86-NEXT:    vpbroadcastw %eax, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd0]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpbroadcastw %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xcf]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpbroadcastw %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc7]
+; X64-NEXT:    vpbroadcastw %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd7]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
     %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask)
    %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask)
@@ -108,23 +108,23 @@ declare <32 x i16> @llvm.x86.avx512.mask
 declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
 
 define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdx, %k1
-; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vmovdqu8 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT:    vmovdqu64 %zmm0, (%eax)
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu8 %zmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdx, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xca]
+; X64-NEXT:    vmovdqu8 %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqu64 %zmm0, (%rsi) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
   call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
   ret void
@@ -133,23 +133,23 @@ define void@test_int_x86_avx512_mask_sto
 declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)
 
 define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edx, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rsi)
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vmovdqu16 %zmm0, (%ecx) {%k1}
-; AVX512F-32-NEXT:    vmovdqu64 %zmm0, (%eax)
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu16 %zmm0, (%ecx) {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x7f,0x01]
+; X86-NEXT:    vmovdqu64 %zmm0, (%eax) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x00]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
+; X64-NEXT:    vmovdqu16 %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x7f,0x07]
+; X64-NEXT:    vmovdqu64 %zmm0, (%rsi) # encoding: [0x62,0xf1,0xfe,0x48,0x7f,0x06]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
   call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
   ret void
@@ -158,25 +158,25 @@ define void@test_int_x86_avx512_mask_sto
 declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512BW-NEXT:    kmovd %edx, %k1
-; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    vmovdqu64 (%ecx), %zmm0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1}
-; AVX512F-32-NEXT:    vmovdqu16 (%ecx), %zmm1 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu16 (%eax), %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqu16 (%ecx), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x09]
+; X86-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; X64-NEXT:    kmovd %edx, %k1 # encoding: [0xc5,0xfb,0x92,0xca]
+; X64-NEXT:    vmovdqu16 (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0x06]
+; X64-NEXT:    vmovdqu16 (%rdi), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
   %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
@@ -187,25 +187,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqu64 (%rdi), %zmm0
-; AVX512BW-NEXT:    kmovq %rdx, %k1
-; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0 {%k1}
-; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    vmovdqu64 (%ecx), %zmm0
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vmovdqu8 (%eax), %zmm0 {%k1}
-; AVX512F-32-NEXT:    vmovdqu8 (%ecx), %zmm1 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
+; X86-NEXT:    vmovdqu64 (%ecx), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x01]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x0c]
+; X86-NEXT:    vmovdqu8 (%eax), %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0x00]
+; X86-NEXT:    vmovdqu8 (%ecx), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x09]
+; X86-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqu64 (%rdi), %zmm0 # encoding: [0x62,0xf1,0xfe,0x48,0x6f,0x07]
+; X64-NEXT:    kmovq %rdx, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xca]
+; X64-NEXT:    vmovdqu8 (%rsi), %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0x06]
+; X64-NEXT:    vmovdqu8 (%rdi), %zmm1 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0x0f]
+; X64-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
   %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
@@ -216,19 +216,14 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64>, i32)
 
 define <8 x i64>@test_int_x86_avx512_psll_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_psll_dq_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
-; AVX512BW-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
-; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_psll_dq_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
-; AVX512F-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_int_x86_avx512_psll_dq_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslldq $8, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xf8,0x08]
+; CHECK-NEXT:    # zmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55]
+; CHECK-NEXT:    vpslldq $4, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xf8,0x04]
+; CHECK-NEXT:    # zmm0 = zero,zero,zero,zero,zmm0[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,zmm0[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,zmm0[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,zmm0[48,49,50,51,52,53,54,55,56,57,58,59]
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
   %res2 = add <8 x i64> %res, %res1
@@ -236,16 +231,18 @@ define <8 x i64>@test_int_x86_avx512_psl
 }
 
 define <8 x i64>@test_int_x86_avx512_psll_load_dq_512(<8 x i64>* %p0) {
-; AVX512BW-LABEL: test_int_x86_avx512_psll_load_dq_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_psll_load_dq_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpslldq $4, (%eax), %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0x38,0x04]
+; X86-NEXT:    # zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_psll_load_dq_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpslldq $4, (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0x3f,0x04]
+; X64-NEXT:    # zmm0 = zero,zero,zero,zero,mem[0,1,2,3,4,5,6,7,8,9,10,11],zero,zero,zero,zero,mem[16,17,18,19,20,21,22,23,24,25,26,27],zero,zero,zero,zero,mem[32,33,34,35,36,37,38,39,40,41,42,43],zero,zero,zero,zero,mem[48,49,50,51,52,53,54,55,56,57,58,59]
+; X64-NEXT:    retq # encoding: [0xc3]
   %x0 = load <8 x i64>, <8 x i64> *%p0
   %res = call <8 x i64> @llvm.x86.avx512.psll.dq.512(<8 x i64> %x0, i32 4)
   ret <8 x i64> %res
@@ -254,19 +251,14 @@ define <8 x i64>@test_int_x86_avx512_psl
 declare <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64>, i32)
 
 define <8 x i64>@test_int_x86_avx512_psrl_dq_512(<8 x i64> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_psrl_dq_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_psrl_dq_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsrldq {{.*#+}} zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-32-NEXT:    vpsrldq {{.*#+}} zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; AVX512F-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_int_x86_avx512_psrl_dq_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrldq $8, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x75,0x48,0x73,0xd8,0x08]
+; CHECK-NEXT:    # zmm1 = zmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[40,41,42,43,44,45,46,47],zero,zero,zero,zero,zero,zero,zero,zero,zmm0[56,57,58,59,60,61,62,63],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    vpsrldq $4, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0xd8,0x04]
+; CHECK-NEXT:    # zmm0 = zmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zmm0[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zmm0[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,zmm0[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 8)
   %res1 = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
   %res2 = add <8 x i64> %res, %res1
@@ -274,16 +266,18 @@ define <8 x i64>@test_int_x86_avx512_psr
 }
 
 define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) {
-; AVX512BW-LABEL: test_int_x86_avx512_psrl_load_dq_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_psrl_load_dq_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpsrldq {{.*#+}} zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsrldq $4, (%eax), %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0x18,0x04]
+; X86-NEXT:    # zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_psrl_load_dq_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrldq $4, (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x73,0x1f,0x04]
+; X64-NEXT:    # zmm0 = mem[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,mem[20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,mem[36,37,38,39,40,41,42,43,44,45,46,47],zero,zero,zero,zero,mem[52,53,54,55,56,57,58,59,60,61,62,63],zero,zero,zero,zero
+; X64-NEXT:    retq # encoding: [0xc3]
   %x0 = load <8 x i64>, <8 x i64> *%p0
   %res = call <8 x i64> @llvm.x86.avx512.psrl.dq.512(<8 x i64> %x0, i32 4)
   ret <8 x i64> %res
@@ -292,25 +286,31 @@ define <8 x i64>@test_int_x86_avx512_psr
 declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpalignr {{.*#+}} zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_palignr_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02]
+; X86-NEXT:    # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02]
+; X86-NEXT:    # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X86-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02]
+; X86-NEXT:    # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X86-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
+; X86-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_palignr_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02]
+; X64-NEXT:    # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02]
+; X64-NEXT:    # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X64-NEXT:    vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02]
+; X64-NEXT:    # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
+; X64-NEXT:    vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3]
+; X64-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
   %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1)
@@ -322,25 +322,31 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpshufhw {{.*#+}} zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03]
+; X86-NEXT:    # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03]
+; X86-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X86-NEXT:    vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03]
+; X86-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03]
+; X64-NEXT:    # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03]
+; X64-NEXT:    # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X64-NEXT:    vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03]
+; X64-NEXT:    # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
@@ -352,25 +358,31 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpshuflw {{.*#+}} zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03]
+; X86-NEXT:    # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03]
+; X86-NEXT:    # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X86-NEXT:    vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03]
+; X86-NEXT:    # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03]
+; X64-NEXT:    # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03]
+; X64-NEXT:    # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X64-NEXT:    vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03]
+; X64-NEXT:    # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
@@ -380,44 +392,44 @@ define <32 x i16>@test_int_x86_avx512_ma
 }
 
 define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpeq_b:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_b:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_pcmpeq_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_pcmpeq_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
 }
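
The X86 body above is longer than the X64 one because the i386 psABI returns a 64-bit integer in the EDX:EAX register pair, and kmovq to a GPR needs a 64-bit register and so is unavailable in 32-bit mode; the 64-lane compare mask therefore has to be split. A commented sketch of that split, using the same registers as the generated code:

; kshiftrq $32, %k0, %k1   ; bits 63:32 of the mask -> low half of k1
; kmovd    %k0, %eax       ; bits 31:0  of the result in EAX
; kmovd    %k1, %edx       ; bits 63:32 of the result in EDX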
 
 define i64 @test_mask_pcmpeq_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_b:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    andq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_b:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_pcmpeq_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpeq_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    andq %rdi, %rax # encoding: [0x48,0x21,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
 }
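
In the masked variant the %mask operand is not folded into the compare itself: both targets perform the full 64-lane compare and then apply the mask as a scalar AND on the extracted result (one andq against %rdi on x86-64, two andl against the stack-passed halves on i686). Roughly, as a sketch in IR-comment form:

;   %cmp = <all-lanes compare result, extracted to scalar i64>
;   %res = and i64 %cmp, %mask   ; mask applied after the compare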
@@ -425,39 +437,32 @@ define i64 @test_mask_pcmpeq_b(<64 x i8>
 declare i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8>, <64 x i8>, i64)
 
 define i32 @test_pcmpeq_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpeq_w:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_pcmpeq_w:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_pcmpeq_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
 }
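
test_pcmpeq_w is the first test in this hunk whose 32-bit and 64-bit bodies match apart from the return instruction, so the update script emits a single merged CHECK block and matches the return with a regex rather than duplicating everything under X86/X64. The pattern relies on retl and retq sharing an encoding:

; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;   accepts "retl" (i686) or "retq" (x86-64); near RET encodes as 0xc3 in both modes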
 
 define i32 @test_mask_pcmpeq_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpeq_w:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    andl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpeq_w:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_pcmpeq_w:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpeq_w:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    andl %edi, %eax # encoding: [0x21,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
 }
@@ -465,44 +470,44 @@ define i32 @test_mask_pcmpeq_w(<32 x i16
 declare i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16>, <32 x i16>, i32)
 
 define i64 @test_pcmpgt_b(<64 x i8> %a, <64 x i8> %b) {
-; AVX512BW-LABEL: test_pcmpgt_b:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_b:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_pcmpgt_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_pcmpgt_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
 }
 
 define i64 @test_mask_pcmpgt_b(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_b:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    andq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_b:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_pcmpgt_b:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %edx # encoding: [0x23,0x54,0x24,0x08]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpgt_b:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    andq %rdi, %rax # encoding: [0x48,0x21,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
 }
@@ -510,39 +515,32 @@ define i64 @test_mask_pcmpgt_b(<64 x i8>
 declare i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8>, <64 x i8>, i64)
 
 define i32 @test_pcmpgt_w(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_pcmpgt_w:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_pcmpgt_w:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_pcmpgt_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
 }
 
 define i32 @test_mask_pcmpgt_w(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_pcmpgt_w:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    andl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_pcmpgt_w:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_pcmpgt_w:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax # encoding: [0x23,0x44,0x24,0x04]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_pcmpgt_w:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    andl %edi, %eax # encoding: [0x21,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
 }
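
These pcmpeq/pcmpgt tests live in this -upgrade file because the llvm.x86.avx512.mask.pcmp* intrinsics are auto-upgraded to generic IR, and the plain vpcmp*/kmov sequences above are what that generic IR selects to. A minimal sketch of the kind of IR the upgrader might produce for the i32-mask case (the exact upgraded form is an assumption here, not shown by this patch):

;   %cmp = icmp sgt <32 x i16> %a, %b
;   %res = bitcast <32 x i1> %cmp to i32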
@@ -552,21 +550,25 @@ declare i32 @llvm.x86.avx512.mask.pcmpgt
 declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpunpckhbw {{.*#+}} zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9]
+; X86-NEXT:    # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1]
+; X86-NEXT:    # zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9]
+; X64-NEXT:    # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1]
+; X64-NEXT:    # zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
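
The punpckhb hunk above also shows the mechanical effect of --show-mc-encoding on the shuffle tests: the shuffle-decode comment that previously rode on the instruction line (matched via {{.*#+}} in the old checks) now gets a comment-only CHECK line of its own, presumably because the encoding bytes occupy the instruction line. Schematically:

; X64-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9]
; X64-NEXT: # zmm3 = zmm0[8],zmm1[8],...   ; decoded lane mapping on its own line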
@@ -576,21 +578,25 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpunpcklbw {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpunpcklbw {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9]
+; X86-NEXT:    # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1]
+; X86-NEXT:    # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9]
+; X64-NEXT:    # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1]
+; X64-NEXT:    # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -600,21 +606,25 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpunpckhwd {{.*#+}} zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpunpckhwd {{.*#+}} zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9]
+; X86-NEXT:    # zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1]
+; X86-NEXT:    # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9]
+; X64-NEXT:    # zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1]
+; X64-NEXT:    # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -624,21 +634,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpunpcklwd {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpunpcklwd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9]
+; X86-NEXT:    # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1]
+; X86-NEXT:    # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9]
+; X64-NEXT:    # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1]
+; X64-NEXT:    # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -648,21 +662,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -672,21 +686,21 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
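
Across the pmaxs/pmaxu/pmins/pminu tests the only X86/X64 difference is where the mask comes from: the SysV x86-64 ABI passes the first integer argument in %rdi/%edi, so the mask is moved with kmovq %rdi or kmovd %edi, while i686 passes arguments on the stack and loads the mask straight into the k-register:

; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1   ; i32 mask loaded from the stack
; X64-NEXT: kmovd %edi, %k1               ; i32 mask already in a register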
@@ -696,21 +710,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaxub %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -720,21 +734,21 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -744,21 +758,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpminsb %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmins_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmins_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpminsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -768,21 +782,21 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmins_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmins_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpminsw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmins_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmins_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -792,21 +806,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpminub %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pminu_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pminu_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -816,21 +830,21 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pminu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pminu_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpminuw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pminu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pminu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -840,25 +854,31 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0]
+; X86-NEXT:    # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x30,0xc8]
+; X86-NEXT:    # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X86-NEXT:    vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0]
+; X86-NEXT:    # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0]
+; X64-NEXT:    # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x30,0xc8]
+; X64-NEXT:    # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X64-NEXT:    vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0]
+; X64-NEXT:    # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
@@ -870,25 +890,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm2
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovsxbw %ymm0, %zmm2
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8]
+; X86-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8]
+; X64-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1)
@@ -900,25 +920,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm3, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm3, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psrl_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd1,0xd1]
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd1,0xc1]
+; X86-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrl_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd1,0xd1]
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd1,0xc1]
+; X64-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -930,25 +950,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm2
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm2
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xd0,0x03]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x03]
+; X86-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x03]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrl_wi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xd0,0x03]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x03]
+; X64-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x03]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -960,25 +980,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psra_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsraw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe1,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe1,0xd1]
+; X86-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe1,0xc1]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psra_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsraw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe1,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe1,0xd1]
+; X64-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe1,0xc1]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -990,25 +1010,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psra_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psra_wi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm2
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsraw $3, %zmm0, %zmm2
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsraw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xe0,0x03]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x03]
+; X86-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xe0,0x03]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psra_wi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsraw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xe0,0x03]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x03]
+; X64-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xe0,0x03]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
@@ -1020,25 +1040,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psll_w_512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psll_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsllw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf1,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf1,0xd1]
+; X86-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xf1,0xc1]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psll_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsllw %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf1,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf1,0xd1]
+; X64-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xf1,0xc1]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -1050,25 +1070,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psll_wi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm2
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpsllw $3, %zmm0, %zmm2
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psll_wi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsllw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xf0,0x03]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x03]
+; X86-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xf0,0x03]
+; X86-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X86-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psll_wi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsllw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x6d,0x48,0x71,0xf0,0x03]
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x03]
+; X64-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xf0,0x03]
+; X64-NEXT:    vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2]
+; X64-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1)
@@ -1080,21 +1100,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -1105,17 +1125,17 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64)
 
 define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_cvtmask2b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf8,0x90,0x44,0x24,0x04]
+; X86-NEXT:    vpmovm2b %k0, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x28,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_cvtmask2b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k0 # encoding: [0xc4,0xe1,0xfb,0x92,0xc7]
+; X64-NEXT:    vpmovm2b %k0, %zmm0 # encoding: [0x62,0xf2,0x7e,0x48,0x28,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64 %x0)
   ret <64 x i8> %res
 }
@@ -1123,133 +1143,128 @@ define <64 x i8>@test_int_x86_avx512_cvt
 declare <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32)
 
 define <32 x i16>@test_int_x86_avx512_cvtmask2w_512(i32 %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    vpmovm2w %k0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_cvtmask2w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k0 # encoding: [0xc4,0xe1,0xf9,0x90,0x44,0x24,0x04]
+; X86-NEXT:    vpmovm2w %k0, %zmm0 # encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_cvtmask2w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k0 # encoding: [0xc5,0xfb,0x92,0xc7]
+; X64-NEXT:    vpmovm2w %k0, %zmm0 # encoding: [0x62,0xf2,0xfe,0x48,0x28,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32 %x0)
   ret <32 x i16> %res
 }
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packs_epi32_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1258,20 +1273,20 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmbk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmbk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1280,18 +1295,18 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmbkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmbkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1302,102 +1317,97 @@ define <32 x i16> @test_mask_packs_epi32
 declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
 
 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packs_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
@@ -1407,118 +1417,113 @@ declare <64 x i8> @llvm.x86.avx512.mask.
 
 
 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packus_epi32_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x58,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x58,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1527,20 +1532,20 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmbk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x59,0x2b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmbk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x59,0x2b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1549,18 +1554,18 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmbkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xd9,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmbkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xd9,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -1571,102 +1576,97 @@ define <32 x i16> @test_mask_packus_epi3
 declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
 
 define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packus_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
   ret <64 x i8> %res
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %res
@@ -1675,78 +1675,78 @@ define <64 x i8> @test_mask_packus_epi16
 declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
 
 define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
-; AVX512BW-LABEL: test_cmp_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    leaq -1(%rcx,%rax), %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_cmp_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    .cfi_offset %esi, -12
-; AVX512F-32-NEXT:    .cfi_offset %edi, -8
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %eax
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %ecx, %esi
-; AVX512F-32-NEXT:    adcl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %eax
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %esi, %ecx
-; AVX512F-32-NEXT:    adcl %edx, %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %ecx, %esi
-; AVX512F-32-NEXT:    adcl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    kmovd %k0, %edi
-; AVX512F-32-NEXT:    addl %esi, %edi
-; AVX512F-32-NEXT:    adcl %edx, %ecx
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %edi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    addl $-1, %eax
-; AVX512F-32-NEXT:    adcl $-1, %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    popl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_cmp_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi # encoding: [0x57]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
+; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
+; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
+; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
+; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
+; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X86-NEXT:    adcl %ecx, %edx # encoding: [0x11,0xca]
+; X86-NEXT:    addl $-1, %eax # encoding: [0x83,0xc0,0xff]
+; X86-NEXT:    adcl $-1, %edx # encoding: [0x83,0xd2,0xff]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi # encoding: [0x5f]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xc0]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
   %ret1 = add i64 %res0, %res1
@@ -1766,106 +1766,106 @@ define i64 @test_cmp_b_512(<64 x i8> %a0
 }
 
 define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-; AVX512BW-LABEL: test_mask_cmp_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rdx
-; AVX512BW-NEXT:    addq %rcx, %rdx
-; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rdx, %rax
-; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_cmp_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %ebp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    pushl %ebx
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    pushl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 20
-; AVX512F-32-NEXT:    .cfi_offset %esi, -20
-; AVX512F-32-NEXT:    .cfi_offset %edi, -16
-; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kmovd %esi, %k0
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %eax
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %edx
-; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %edi
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ebx
-; AVX512F-32-NEXT:    addl %edx, %ebx
-; AVX512F-32-NEXT:    adcl %eax, %edi
-; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %eax
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %edx
-; AVX512F-32-NEXT:    addl %ebx, %edx
-; AVX512F-32-NEXT:    adcl %edi, %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %edi
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ebx
-; AVX512F-32-NEXT:    addl %edx, %ebx
-; AVX512F-32-NEXT:    adcl %eax, %edi
-; AVX512F-32-NEXT:    vpcmpnltb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %ebp
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ecx
-; AVX512F-32-NEXT:    addl %ebx, %ecx
-; AVX512F-32-NEXT:    adcl %edi, %ebp
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    adcl %ebp, %edx
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT:    popl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    popl %ebx
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    popl %ebp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_cmp_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %ebx # encoding: [0x53]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %edi # encoding: [0x57]
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x18]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1]
+; X86-NEXT:    kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    vpcmpgtb %zmm0, %zmm1, %k2 # encoding: [0x62,0xf1,0x75,0x48,0x64,0xd0]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda]
+; X86-NEXT:    addl %edx, %ebx # encoding: [0x01,0xd3]
+; X86-NEXT:    adcl %eax, %edi # encoding: [0x11,0xc7]
+; X86-NEXT:    vpcmpleb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x02]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    addl %ebx, %edx # encoding: [0x01,0xda]
+; X86-NEXT:    adcl %edi, %eax # encoding: [0x11,0xf8]
+; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda]
+; X86-NEXT:    addl %edx, %ebx # encoding: [0x01,0xd3]
+; X86-NEXT:    adcl %eax, %edi # encoding: [0x11,0xc7]
+; X86-NEXT:    vpcmpnltb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x05]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %ebp # encoding: [0xc5,0xfb,0x93,0xeb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    addl %ebx, %ecx # encoding: [0x01,0xd9]
+; X86-NEXT:    adcl %edi, %ebp # encoding: [0x11,0xfd]
+; X86-NEXT:    vpcmpgtb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xd1]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    adcl %ebp, %edx # encoding: [0x11,0xea]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x18]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    popl %edi # encoding: [0x5f]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    popl %ebx # encoding: [0x5b]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_cmp_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x64,0xc0]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
   %ret1 = add i64 %res0, %res1
@@ -1887,78 +1887,78 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
 
 define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
-; AVX512BW-LABEL: test_ucmp_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    leaq -1(%rcx,%rax), %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_ucmp_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    .cfi_offset %esi, -12
-; AVX512F-32-NEXT:    .cfi_offset %edi, -8
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %eax
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %ecx, %esi
-; AVX512F-32-NEXT:    adcl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %eax
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %esi, %ecx
-; AVX512F-32-NEXT:    adcl %edx, %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %ecx, %esi
-; AVX512F-32-NEXT:    adcl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    kmovd %k0, %edi
-; AVX512F-32-NEXT:    addl %esi, %edi
-; AVX512F-32-NEXT:    adcl %edx, %ecx
-; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %edi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    addl $-1, %eax
-; AVX512F-32-NEXT:    adcl $-1, %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    popl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_ucmp_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi # encoding: [0x57]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    .cfi_offset %esi, -12
+; X86-NEXT:    .cfi_offset %edi, -8
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
+; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %eax # encoding: [0xc5,0xfb,0x93,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %esi, %ecx # encoding: [0x01,0xf1]
+; X86-NEXT:    adcl %edx, %eax # encoding: [0x11,0xd0]
+; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %ecx, %esi # encoding: [0x01,0xce]
+; X86-NEXT:    adcl %eax, %edx # encoding: [0x11,0xc2]
+; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    kmovd %k0, %edi # encoding: [0xc5,0xfb,0x93,0xf8]
+; X86-NEXT:    addl %esi, %edi # encoding: [0x01,0xf7]
+; X86-NEXT:    adcl %edx, %ecx # encoding: [0x11,0xd1]
+; X86-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X86-NEXT:    adcl %ecx, %edx # encoding: [0x11,0xca]
+; X86-NEXT:    addl $-1, %eax # encoding: [0x83,0xc0,0xff]
+; X86-NEXT:    adcl $-1, %edx # encoding: [0x83,0xd2,0xff]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %edi # encoding: [0x5f]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
   %ret1 = add i64 %res0, %res1
@@ -1978,106 +1978,106 @@ define i64 @test_ucmp_b_512(<64 x i8> %a
 }
 
 define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-; AVX512BW-LABEL: test_mask_x86_avx512_ucmp_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rcx, %rax
-; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rcx
-; AVX512BW-NEXT:    addq %rax, %rcx
-; AVX512BW-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rdx
-; AVX512BW-NEXT:    addq %rcx, %rdx
-; AVX512BW-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    addq %rdx, %rax
-; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %ebp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    pushl %ebx
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    pushl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 20
-; AVX512F-32-NEXT:    .cfi_offset %esi, -20
-; AVX512F-32-NEXT:    .cfi_offset %edi, -16
-; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kmovd %esi, %k0
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %eax
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %edx
-; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %edi
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ebx
-; AVX512F-32-NEXT:    addl %edx, %ebx
-; AVX512F-32-NEXT:    adcl %eax, %edi
-; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %eax
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %edx
-; AVX512F-32-NEXT:    addl %ebx, %edx
-; AVX512F-32-NEXT:    adcl %edi, %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %edi
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ebx
-; AVX512F-32-NEXT:    addl %edx, %ebx
-; AVX512F-32-NEXT:    adcl %eax, %edi
-; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k3
-; AVX512F-32-NEXT:    kmovd %k3, %ebp
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k2
-; AVX512F-32-NEXT:    kmovd %k2, %ecx
-; AVX512F-32-NEXT:    addl %ebx, %ecx
-; AVX512F-32-NEXT:    adcl %edi, %ebp
-; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k2
-; AVX512F-32-NEXT:    kshiftrq $32, %k2, %k3
-; AVX512F-32-NEXT:    kandd %k1, %k3, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    kandd %k0, %k2, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    adcl %ebp, %edx
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT:    popl %edi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    popl %ebx
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    popl %ebp
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_x86_avx512_ucmp_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp # encoding: [0x55]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %ebx # encoding: [0x53]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %edi # encoding: [0x57]
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x14]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x18]
+; X86-NEXT:    vpcmpeqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf1,0x7d,0x48,0x74,0xd1]
+; X86-NEXT:    kmovd %esi, %k0 # encoding: [0xc5,0xfb,0x92,0xc6]
+; X86-NEXT:    kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    vpcmpltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x01]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda]
+; X86-NEXT:    addl %edx, %ebx # encoding: [0x01,0xd3]
+; X86-NEXT:    adcl %eax, %edi # encoding: [0x11,0xc7]
+; X86-NEXT:    vpcmpleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x02]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %eax # encoding: [0xc5,0xfb,0x93,0xc3]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %edx # encoding: [0xc5,0xfb,0x93,0xd2]
+; X86-NEXT:    addl %ebx, %edx # encoding: [0x01,0xda]
+; X86-NEXT:    adcl %edi, %eax # encoding: [0x11,0xf8]
+; X86-NEXT:    vpcmpneqb %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xd1,0x04]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %edi # encoding: [0xc5,0xfb,0x93,0xfb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ebx # encoding: [0xc5,0xfb,0x93,0xda]
+; X86-NEXT:    addl %edx, %ebx # encoding: [0x01,0xd3]
+; X86-NEXT:    adcl %eax, %edi # encoding: [0x11,0xc7]
+; X86-NEXT:    vpcmpnltub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x05]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k3 # encoding: [0xc4,0xe1,0xe5,0x41,0xd9]
+; X86-NEXT:    kmovd %k3, %ebp # encoding: [0xc5,0xfb,0x93,0xeb]
+; X86-NEXT:    kandd %k0, %k2, %k2 # encoding: [0xc4,0xe1,0xed,0x41,0xd0]
+; X86-NEXT:    kmovd %k2, %ecx # encoding: [0xc5,0xfb,0x93,0xca]
+; X86-NEXT:    addl %ebx, %ecx # encoding: [0x01,0xd9]
+; X86-NEXT:    adcl %edi, %ebp # encoding: [0x11,0xfd]
+; X86-NEXT:    vpcmpnleub %zmm1, %zmm0, %k2 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xd1,0x06]
+; X86-NEXT:    kshiftrq $32, %k2, %k3 # encoding: [0xc4,0xe3,0xf9,0x31,0xda,0x20]
+; X86-NEXT:    kandd %k1, %k3, %k1 # encoding: [0xc4,0xe1,0xe5,0x41,0xc9]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    kandd %k0, %k2, %k0 # encoding: [0xc4,0xe1,0xed,0x41,0xc0]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    adcl %ebp, %edx # encoding: [0x11,0xea]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx # encoding: [0x13,0x54,0x24,0x18]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    popl %edi # encoding: [0x5f]
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    popl %ebx # encoding: [0x5b]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    popl %ebp # encoding: [0x5d]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_x86_avx512_ucmp_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x74,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
+; X64-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT:    addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
+; X64-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
+; X64-NEXT:    addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
   %ret1 = add i64 %res0, %res1
@@ -2099,49 +2099,49 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
 
 define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
-; AVX512BW-LABEL: test_cmp_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    leal -1(%rcx,%rax), %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_cmp_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %eax, %ecx
-; AVX512F-32-NEXT:    vpcmplew %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %eax, %ecx
-; AVX512F-32-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    leal -1(%ecx,%eax), %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_cmp_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_cmp_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 # encoding: [0x62,0xf1,0x75,0x48,0x65,0xc0]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
   %ret1 = add i32 %res0, %res1
@@ -2161,59 +2161,59 @@ define i32 @test_cmp_w_512(<32 x i16> %a
 }
 
 define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-; AVX512BW-LABEL: test_mask_cmp_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %edx
-; AVX512BW-NEXT:    addl %ecx, %edx
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %edx, %eax
-; AVX512BW-NEXT:    addl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_cmp_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    .cfi_offset %esi, -8
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %edx
-; AVX512F-32-NEXT:    addl %eax, %edx
-; AVX512F-32-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %edx, %eax
-; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %edx
-; AVX512F-32-NEXT:    addl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %edx, %esi
-; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_cmp_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_cmp_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpgtw %zmm0, %zmm1, %k0 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x65,0xc0]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
   %ret1 = add i32 %res0, %res1
@@ -2235,49 +2235,49 @@ define i32 @test_mask_cmp_w_512(<32 x i1
 declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
 
 define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
-; AVX512BW-LABEL: test_ucmp_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    leal -1(%rcx,%rax), %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_ucmp_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %eax, %ecx
-; AVX512F-32-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    addl %eax, %ecx
-; AVX512F-32-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    leal -1(%ecx,%eax), %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_ucmp_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_ucmp_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
   %ret1 = add i32 %res0, %res1
@@ -2297,59 +2297,59 @@ define i32 @test_ucmp_w_512(<32 x i16> %
 }
 
 define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-; AVX512BW-LABEL: test_mask_ucmp_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %ecx, %eax
-; AVX512BW-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %ecx
-; AVX512BW-NEXT:    addl %eax, %ecx
-; AVX512BW-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %edx
-; AVX512BW-NEXT:    addl %ecx, %edx
-; AVX512BW-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    addl %edx, %eax
-; AVX512BW-NEXT:    addl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_mask_ucmp_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    .cfi_offset %esi, -8
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %edx
-; AVX512F-32-NEXT:    addl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %edx, %eax
-; AVX512F-32-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %edx
-; AVX512F-32-NEXT:    addl %eax, %edx
-; AVX512F-32-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    addl %edx, %esi
-; AVX512F-32-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_mask_ucmp_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
+; X86-NEXT:    kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
+; X86-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X86-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X86-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT:    addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_ucmp_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x75,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    vpcmpltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x01]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X64-NEXT:    vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X64-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT:    addl %eax, %ecx # encoding: [0x01,0xc1]
+; X64-NEXT:    vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
+; X64-NEXT:    kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X64-NEXT:    addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT:    vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
   %ret1 = add i32 %res0, %res1
@@ -2374,21 +2374,21 @@ declare i32 @llvm.x86.avx512.mask.ucmp.w
 declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
-; AVX512BW-LABEL: mm512_avg_epu8:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: mm512_avg_epu8:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpavgb %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: mm512_avg_epu8:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X86-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: mm512_avg_epu8:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1]
+; X64-NEXT:    vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -2399,21 +2399,21 @@ define <64 x i8>@mm512_avg_epu8(<64 x i8
 declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: mm512_avg_epu16:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: mm512_avg_epu16:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpavgw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: mm512_avg_epu16:
+; X86:       # %bb.0:
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: mm512_avg_epu16:
+; X64:       # %bb.0:
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2423,21 +2423,21 @@ define <32 x i16>@mm512_avg_epu16(<32 x
 declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pabs_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpabsw %zmm0, %zmm2
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpabsw %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpabsw %zmm0, %zmm2
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpabsw %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pabs_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpabsw %zmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x1d,0xd0]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpabsw %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x1d,0xc8]
+; X86-NEXT:    vpaddw %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pabs_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpabsw %zmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x1d,0xd0]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpabsw %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x1d,0xc8]
+; X64-NEXT:    vpaddw %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2447,21 +2447,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64)
 
 define <64 x i8>@test_int_x86_avx512_mask_pabs_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pabs_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpabsb %zmm0, %zmm2
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpabsb %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pabs_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpabsb %zmm0, %zmm2
-; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpabsb %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm2, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pabs_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpabsb %zmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x1c,0xd0]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpabsb %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x1c,0xc8]
+; X86-NEXT:    vpaddb %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pabs_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpabsb %zmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x1c,0xd0]
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpabsb %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x1c,0xc8]
+; X64-NEXT:    vpaddb %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 -1)
   %res2 = add <64 x i8> %res, %res1
@@ -2471,34 +2471,34 @@ define <64 x i8>@test_int_x86_avx512_mas
 declare i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8>, <64 x i8>, i64)
 
 define i64@test_int_x86_avx512_ptestm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestm_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vptestmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    andq %rax, %rdi
-; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    .cfi_offset %esi, -8
-; AVX512F-32-NEXT:    vptestmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    andl %ecx, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl %esi, %eax
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_ptestm_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    vptestmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    andl %ecx, %edx # encoding: [0x21,0xca]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    andl %esi, %eax # encoding: [0x21,0xf0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    adcl %ecx, %edx # encoding: [0x11,0xca]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestm_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x48,0x26,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    andq %rax, %rdi # encoding: [0x48,0x21,0xc7]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
   %res1 = call i64 @llvm.x86.avx512.ptestm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
   %res2 = add i64 %res, %res1
@@ -2508,24 +2508,24 @@ define i64@test_int_x86_avx512_ptestm_b_
 declare i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16>, <32 x i16>, i32)
 
 define i32@test_int_x86_avx512_ptestm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestm_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vptestmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    andl %eax, %edi
-; AVX512BW-NEXT:    addl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestm_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vptestmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl %ecx, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_ptestm_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    andl %ecx, %eax # encoding: [0x21,0xc8]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestm_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x48,0x26,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    andl %eax, %edi # encoding: [0x21,0xc7]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call i32 @llvm.x86.avx512.ptestm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
   %res2 = add i32 %res, %res1
@@ -2535,34 +2535,34 @@ define i32@test_int_x86_avx512_ptestm_w_
 declare i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8>, <64 x i8>, i64 %x2)
 
 define i64@test_int_x86_avx512_ptestnm_b_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vptestnmb %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    andq %rax, %rdi
-; AVX512BW-NEXT:    addq %rdi, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_b_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    .cfi_offset %esi, -8
-; AVX512F-32-NEXT:    vptestnmb %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k1, %ecx
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    andl %ecx, %edx
-; AVX512F-32-NEXT:    kmovd %k0, %esi
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl %esi, %eax
-; AVX512F-32-NEXT:    addl %esi, %eax
-; AVX512F-32-NEXT:    adcl %ecx, %edx
-; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 4
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi # encoding: [0x56]
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    vptestnmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x26,0xc1]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k1, %ecx # encoding: [0xc5,0xfb,0x93,0xc9]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
+; X86-NEXT:    andl %ecx, %edx # encoding: [0x21,0xca]
+; X86-NEXT:    kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT:    andl %esi, %eax # encoding: [0x21,0xf0]
+; X86-NEXT:    addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT:    adcl %ecx, %edx # encoding: [0x11,0xca]
+; X86-NEXT:    popl %esi # encoding: [0x5e]
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestnm_b_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestnmb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x26,0xc1]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    andq %rax, %rdi # encoding: [0x48,0x21,0xc7]
+; X64-NEXT:    addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64 %x2)
   %res1 = call i64 @llvm.x86.avx512.ptestnm.b.512(<64 x i8> %x0, <64 x i8> %x1, i64-1)
   %res2 = add i64 %res, %res1
@@ -2572,24 +2572,24 @@ define i64@test_int_x86_avx512_ptestnm_b
 declare i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16>, <32 x i16>, i32 %x2)
 
 define i32@test_int_x86_avx512_ptestnm_w_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_ptestnm_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vptestnmw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    andl %eax, %edi
-; AVX512BW-NEXT:    addl %edi, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_ptestnm_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vptestnmw %zmm1, %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %ecx
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    andl %ecx, %eax
-; AVX512F-32-NEXT:    addl %ecx, %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vptestnmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x26,0xc1]
+; X86-NEXT:    kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    andl %ecx, %eax # encoding: [0x21,0xc8]
+; X86-NEXT:    addl %ecx, %eax # encoding: [0x01,0xc8]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_ptestnm_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vptestnmw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x26,0xc1]
+; X64-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT:    andl %eax, %edi # encoding: [0x21,0xc7]
+; X64-NEXT:    addl %edi, %eax # encoding: [0x01,0xf8]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call i32 @llvm.x86.avx512.ptestnm.w.512(<32 x i16> %x0, <32 x i16> %x1, i32-1)
   %res2 = add i32 %res, %res1
@@ -2599,21 +2599,21 @@ define i32@test_int_x86_avx512_ptestnm_w
 declare i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8>)
 
 define i64@test_int_x86_avx512_cvtb2mask_512(<64 x i8> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtb2mask_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512BW-NEXT:    kmovq %k0, %rax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtb2mask_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
-; AVX512F-32-NEXT:    kshiftrq $32, %k0, %k1
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    kmovd %k1, %edx
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_cvtb2mask_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmovb2m %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x29,0xc0]
+; X86-NEXT:    kshiftrq $32, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x31,0xc8,0x20]
+; X86-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT:    kmovd %k1, %edx # encoding: [0xc5,0xfb,0x93,0xd1]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_cvtb2mask_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmovb2m %zmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x48,0x29,0xc0]
+; X64-NEXT:    kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res = call i64 @llvm.x86.avx512.cvtb2mask.512(<64 x i8> %x0)
     ret i64 %res
 }
@@ -2621,19 +2621,12 @@ define i64@test_int_x86_avx512_cvtb2mask
 declare i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16>)
 
 define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtw2mask_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtw2mask_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmovw2m %zmm0, %k0
-; AVX512F-32-NEXT:    kmovd %k0, %eax
-; AVX512F-32-NEXT:    vzeroupper
-; AVX512F-32-NEXT:    retl
+; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpmovw2m %zmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x48,0x29,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     %res = call i32 @llvm.x86.avx512.cvtw2mask.512(<32 x i16> %x0)
     ret i32 %res
 }
@@ -2641,21 +2634,21 @@ define i32@test_int_x86_avx512_cvtw2mask
 declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2665,21 +2658,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2689,21 +2682,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2713,21 +2706,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2737,21 +2730,21 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1)
   %res2 = add <16 x i32> %res, %res1
@@ -2761,25 +2754,25 @@ define <16 x i32>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vpermw %zmm0, %zmm1, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0]
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0]
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -2791,23 +2784,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2817,23 +2810,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1
@@ -2843,23 +2836,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512F-32:       # %bb.0:
-; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512F-32-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512F-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
   %res2 = add <32 x i16> %res, %res1

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll Sun Jun  3 07:56:04 2018
@@ -1,35 +1,30 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW,AVX512BW-64
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW-32
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
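
The new RUN lines exercise a 32-bit (i686) and a 64-bit triple from one file: assembly common to both matches the shared CHECK prefix, per-arch differences match X86/X64, and --show-mc-encoding pins the exact instruction bytes. A minimal sketch of the collapsed form (hypothetical function, not part of this patch; the encoding bytes are illustrative):

  define <16 x i32> @sketch_common(<16 x i32> %a, <16 x i32> %b) {
  ; CHECK-LABEL: sketch_common:
  ; CHECK:       # %bb.0:
  ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc1]
  ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    %r = add <16 x i32> %a, %b
    ret <16 x i32> %r
  }

The ret{{[l|q]}} regex absorbs the retl/retq split, so a body that is otherwise byte-identical on both triples needs only one block of assertions.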
 
 define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packs_epi32_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
@@ -37,17 +32,17 @@ define <32 x i16> @test_mask_packs_epi32
 }
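
The rrk/rrkz variants encode masking at the IR level rather than in the intrinsic: the i32 mask is bitcast to <32 x i1>, and a select picks between the intrinsic result and either the passthru (merge masking, {%k1}) or zeroinitializer (zero masking, {%k1} {z}). A sketch of the merge form, reusing the packssdw declaration further down (function name hypothetical):

  define <32 x i16> @sketch_merge_mask(<16 x i32> %a, <16 x i32> %b, <32 x i16> %pt, i32 %m) {
    %r = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
    %mv = bitcast i32 %m to <32 x i1>
    ; lanes with a set mask bit take %r, the rest keep the passthru value
    %sel = select <32 x i1> %mv, <32 x i16> %r, <32 x i16> %pt
    ret <32 x i16> %sel
  }

llc folds this pattern into the single masked vpackssdw seen in the checks above.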
 
 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
@@ -55,36 +50,36 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x6b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
@@ -93,18 +88,18 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
@@ -113,16 +108,16 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rmb_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x58,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -131,20 +126,20 @@ define <32 x i16> @test_mask_packs_epi32
 }
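
The rmb/rmbk/rmbkz tests build a splat from a single scalar load; the backend folds that into the (%reg){1to16} embedded-broadcast operand instead of a separate broadcast load, which is why the fourth EVEX byte flips from 0x48/0x49/0xc9 to 0x58/0x59/0xd9 (the b bit) in the encodings. The splat idiom in isolation (hypothetical function name):

  define <16 x i32> @sketch_splat(i32* %p) {
    %s = load i32, i32* %p
    ; insertelement + an all-zero shuffle mask is the canonical splat pattern
    %v0 = insertelement <16 x i32> undef, i32 %s, i32 0
    %splat = shufflevector <16 x i32> %v0, <16 x i32> undef, <16 x i32> zeroinitializer
    ret <16 x i32> %splat
  }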
 
 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rmbk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmbk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmbk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x59,0x6b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -155,18 +150,18 @@ define <32 x i16> @test_mask_packs_epi32
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi32_rmbkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi32_rmbkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi32_rmbkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xd9,0x6b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -179,33 +174,28 @@ define <32 x i16> @test_mask_packs_epi32
 declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
 
 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packs_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
@@ -213,17 +203,17 @@ define <64 x i8> @test_mask_packs_epi16_
 }
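
The epi16 pack tests switch to i64 masks (one bit per byte lane of the <64 x i8> result). On x86-64 the mask arrives in %rdi/%rsi and kmovq moves it straight into %k1; the i686 triple has no 64-bit GPR, so the X86 checks load the mask directly from the stack with kmovq instead. For reference, the zero-masking form of the same select idiom (hypothetical name, reusing the packsswb declaration below):

  define <64 x i8> @sketch_i64_zero_mask(<32 x i16> %a, <32 x i16> %b, i64 %m) {
    %r = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
    %mv = bitcast i64 %m to <64 x i1>
    ; zero masking: cleared mask bits zero the lane rather than keeping a passthru
    %z = select <64 x i1> %mv, <64 x i8> %r, <64 x i8> zeroinitializer
    ret <64 x i8> %z
  }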
 
 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
@@ -231,36 +221,36 @@ define <64 x i8> @test_mask_packs_epi16_
 }
 
 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x63,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x63,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
@@ -269,18 +259,18 @@ define <64 x i8> @test_mask_packs_epi16_
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packs_epi16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packs_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packs_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x63,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
@@ -292,33 +282,28 @@ declare <64 x i8> @llvm.x86.avx512.packs
 
 
 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packus_epi32_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
@@ -326,17 +311,17 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
@@ -344,36 +329,36 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x2b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
@@ -382,18 +367,18 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <16 x i32>, <16 x i32>* %ptr_b
   %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
   %2 = bitcast i32 %mask to <32 x i1>
@@ -402,16 +387,16 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rmb_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x58,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x58,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -420,20 +405,20 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rmbk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmbk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x59,0x2b,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmbk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x59,0x2b,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -444,18 +429,18 @@ define <32 x i16> @test_mask_packus_epi3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi32_rmbkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi32_rmbkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xd9,0x2b,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi32_rmbkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xd9,0x2b,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -468,33 +453,28 @@ define <32 x i16> @test_mask_packus_epi3
 declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
 
 define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_packus_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
@@ -502,17 +482,17 @@ define <64 x i8> @test_mask_packus_epi16
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
   %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
@@ -520,36 +500,36 @@ define <64 x i8> @test_mask_packus_epi16
 }
 
 define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x67,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x67,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
@@ -558,18 +538,18 @@ define <64 x i8> @test_mask_packus_epi16
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
-; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rsi, %k1
-; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_packus_epi16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_packus_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_packus_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce]
+; X64-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x67,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
   %2 = bitcast i64 %mask to <64 x i1>
@@ -580,102 +560,97 @@ define <64 x i8> @test_mask_packus_epi16
 declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_adds_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
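
Unlike the pack tests above, these saturating-add tests still call the legacy masked intrinsic, which takes the passthru vector and the i32 mask as explicit arguments (see the llvm.x86.avx512.mask.padds.w.512 declaration below). A hypothetical select-form equivalent of the rrk variant, written in the newer style for comparison:

  define <32 x i16> @sketch_padds_select_form(<32 x i16> %a, <32 x i16> %b, <32 x i16> %pt, i32 %m) {
    ; unmasked saturating add: all-ones mask, don't-care passthru
    %r = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
    %mv = bitcast i32 %m to <32 x i1>
    %sel = select <32 x i1> %mv, <32 x i16> %r, <32 x i16> %pt
    ret <32 x i16> %sel
  }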
 
 define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epi16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -684,102 +659,97 @@ define <32 x i16> @test_mask_adds_epi16_
 declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_subs_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epi16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -788,102 +758,97 @@ define <32 x i16> @test_mask_subs_epi16_
 declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_adds_epu16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epu16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epu16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epu16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epu16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epu16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epu16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xdd,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epu16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddusw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epu16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddusw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xdd,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_adds_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_adds_epu16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_adds_epu16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddusw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_adds_epu16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddusw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xdd,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -892,102 +857,97 @@ define <32 x i16> @test_mask_adds_epu16_
 declare <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16> @test_mask_subs_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rr_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_mask_subs_epu16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rrk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epu16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epu16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubusw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rrkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epu16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epu16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubusw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rm_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epu16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epu16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 -1)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rmk_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epu16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubusw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epu16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubusw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd9,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask)
   ret <32 x i16> %res
 }
 
 define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; AVX512BW-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_mask_subs_epu16_rmkz_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_mask_subs_epu16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubusw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_mask_subs_epu16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubusw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %b = load <32 x i16>, <32 x i16>* %ptr_b
   %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> zeroinitializer, i32 %mask)
   ret <32 x i16> %res
@@ -996,23 +956,23 @@ define <32 x i16> @test_mask_subs_epu16_
 declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1
@@ -1022,23 +982,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 }
 
 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm3
-; AVX512BW-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
@@ -1050,23 +1010,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm0, %zmm3
-; AVX512BW-32-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X86-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca]
+; X86-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
+; X64-NEXT:    vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca]
+; X64-NEXT:    vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1
@@ -1078,33 +1038,28 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
 
 define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {
-; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_pshuf_b_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
   ret <64 x i8> %res
 }
 
 define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_pshuf_b_512_mask:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
   %mask.cast = bitcast i64 %mask to <64 x i1>
   %res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> %x2
@@ -1112,17 +1067,17 @@ define <64 x i8>@test_int_x86_avx512_psh
 }
 
 define <64 x i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> %x1, i64 %mask) {
-; AVX512BW-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovq %rdi, %k1
-; AVX512BW-NEXT:    vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x00,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_pshuf_b_512_maskz:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
+; X64-NEXT:    vpshufb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x00,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1)
   %mask.cast = bitcast i64 %mask to <64 x i1>
   %res2 = select <64 x i1> %mask.cast, <64 x i8> %res, <64 x i8> zeroinitializer
@@ -1132,21 +1087,21 @@ define <64 x i8>@test_int_x86_avx512_psh
 declare <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
@@ -1158,21 +1113,21 @@ define <32 x i16> @test_int_x86_avx512_m
 declare <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
@@ -1184,21 +1139,21 @@ define <32 x i16> @test_int_x86_avx512_m
 declare <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
@@ -1210,25 +1165,25 @@ define <32 x i16> @test_int_x86_avx512_m
 declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm1 {%k1}
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmovwb %zmm0, %ymm1 {%k1}
-; AVX512BW-32-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-32-NEXT:    vpmovwb %zmm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
+; X86-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc2]
+; X86-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X86-NEXT:    vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0]
+; X86-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1]
+; X64-NEXT:    vpmovwb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc2]
+; X64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X64-NEXT:    vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0]
+; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1240,20 +1195,22 @@ define <32 x i8>@test_int_x86_avx512_mas
 declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi)
-; AVX512BW-NEXT:    vpmovwb %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpmovwb %zmm0, (%eax)
-; AVX512BW-32-NEXT:    vpmovwb %zmm0, (%eax) {%k1}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmovwb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x30,0x00]
+; X86-NEXT:    vpmovwb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0x00]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpmovwb %zmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x48,0x30,0x07]
+; X64-NEXT:    vpmovwb %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1262,25 +1219,25 @@ define void @test_int_x86_avx512_mask_pm
 declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm1 {%k1}
-; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-NEXT:    vpmovswb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmovswb %zmm0, %ymm1 {%k1}
-; AVX512BW-32-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-32-NEXT:    vpmovswb %zmm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1]
+; X86-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2]
+; X86-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X86-NEXT:    vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0]
+; X86-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1]
+; X64-NEXT:    vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2]
+; X64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X64-NEXT:    vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0]
+; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1292,20 +1249,22 @@ define <32 x i8>@test_int_x86_avx512_mas
 declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi)
-; AVX512BW-NEXT:    vpmovswb %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpmovswb %zmm0, (%eax)
-; AVX512BW-32-NEXT:    vpmovswb %zmm0, (%eax) {%k1}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmovswb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x20,0x00]
+; X86-NEXT:    vpmovswb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0x00]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpmovswb %zmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x48,0x20,0x07]
+; X64-NEXT:    vpmovswb %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1314,25 +1273,25 @@ define void @test_int_x86_avx512_mask_pm
 declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1}
-; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1}
-; AVX512BW-32-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z}
-; AVX512BW-32-NEXT:    vpmovuswb %zmm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1]
+; X86-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2]
+; X86-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X86-NEXT:    vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0]
+; X86-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1]
+; X64-NEXT:    vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2]
+; X64-NEXT:    vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca]
+; X64-NEXT:    vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0]
+; X64-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
     %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1)
     %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2)
     %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2)
@@ -1344,20 +1303,22 @@ define <32 x i8>@test_int_x86_avx512_mas
 declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32)
 
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_512(i8* %ptr, <32 x i16> %x1, i32 %x2) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %esi, %k1
-; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi)
-; AVX512BW-NEXT:    vpmovuswb %zmm0, (%rdi) {%k1}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512BW-32-NEXT:    vpmovuswb %zmm0, (%eax)
-; AVX512BW-32-NEXT:    vpmovuswb %zmm0, (%eax) {%k1}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpmovuswb %zmm0, (%eax) # encoding: [0x62,0xf2,0x7e,0x48,0x10,0x00]
+; X86-NEXT:    vpmovuswb %zmm0, (%eax) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0x00]
+; X86-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpmovuswb %zmm0, (%rdi) # encoding: [0x62,0xf2,0x7e,0x48,0x10,0x07]
+; X64-NEXT:    vpmovuswb %zmm0, (%rdi) {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0x07]
+; X64-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
+; X64-NEXT:    retq # encoding: [0xc3]
     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 -1)
     call void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16> %x1, i32 %x2)
     ret void
@@ -1366,21 +1327,21 @@ define void @test_int_x86_avx512_mask_pm
 declare <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8>, <64 x i8>)
 
 define <32 x i16> @test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1]
+; X86-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1]
+; X64-NEXT:    vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
@@ -1392,21 +1353,21 @@ define <32 x i16> @test_int_x86_avx512_m
 declare <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16>, <32 x i16>)
 
 define <16 x i32> @test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpaddd %zmm3, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9]
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1]
+; X86-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1]
+; X64-NEXT:    vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <16 x i32> @llvm.x86.avx512.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
@@ -1418,25 +1379,25 @@ define <16 x i32> @test_int_x86_avx512_m
 declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
-; AVX512BW-32-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x02]
+; X86-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x02]
+; X86-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x42,0xd1,0x02]
+; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x42,0xd9,0x02]
+; X64-NEXT:    vdbpsadbw $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x42,0xc1,0x02]
+; X64-NEXT:    vpaddw %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x65,0x48,0xfd,0xc0]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 -1)
@@ -1448,19 +1409,12 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>)
 
 define  <8 x i64>@test_int_x86_avx512_mask_psadb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2){
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psadb_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1
-; AVX512BW-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_psadb_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1
-; AVX512BW-32-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_int_x86_avx512_mask_psadb_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsadbw %zmm1, %zmm0, %zmm1 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xc9]
+; CHECK-NEXT:    vpsadbw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xf6,0xc2]
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x1)
   %res1 = call  <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %x0, <64 x i8> %x2)
   %res2 = add  <8 x i64> %res, %res1
@@ -1470,25 +1424,25 @@ define  <8 x i64>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrlv32hi:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_psrlv32hi:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x10,0xd1]
+; X86-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x10,0xc1]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrlv32hi:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x10,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x10,0xd1]
+; X64-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x10,0xc1]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -1500,25 +1454,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_psrav32_hi:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsravw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsravw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsravw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsravw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x11,0xd1]
+; X86-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x11,0xc1]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsravw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsravw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x11,0xd1]
+; X64-NEXT:    vpsravw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x11,0xc1]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -1528,17 +1482,23 @@ define <32 x i16>@test_int_x86_avx512_ma
 }
 
 define <32 x i16>@test_int_x86_avx512_mask_psrav32_hi_const(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; AVX512BW-NEXT:    vpsravw {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
-; AVX512BW-32-NEXT:    vpsravw {{\.LCPI.*}}, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; X86:       # %bb.0:
+; X86-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X86-NEXT:    # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X86-NEXT:    # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4
+; X86-NEXT:    vpsravw {{\.LCPI.*}}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
+; X86-NEXT:    # fixup A - offset: 6, value: {{\.LCPI.*}}, kind: FK_Data_4
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psrav32_hi_const:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51,2,9,65524,23,65510,37,65496,51]
+; X64-NEXT:    # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0x05,A,A,A,A]
+; X64-NEXT:    # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
+; X64-NEXT:    vpsravw {{.*}}(%rip), %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x11,0x05,A,A,A,A]
+; X64-NEXT:    # fixup A - offset: 6, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrav32.hi(<32 x i16> <i16 2, i16 9,  i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9,  i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9,  i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51, i16 2, i16 9,  i16 -12, i16 23, i16 -26, i16 37, i16 -40, i16 51>,
                                                           <32 x i16> <i16 1, i16 10, i16 35,  i16 52, i16 69,  i16 9,  i16 16,  i16 49, i16 1, i16 10, i16 35,  i16 52, i16 69,  i16 9,  i16 16,  i16 49, i16 1, i16 10, i16 35,  i16 52, i16 69,  i16 9,  i16 16,  i16 49, i16 1, i16 10, i16 35,  i16 52, i16 69,  i16 9,  i16 16,  i16 49>,
                                                           <32 x i16> zeroinitializer, i32 -1)
@@ -1548,25 +1508,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_psllv32hi:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_psllv32hi:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsllvw %zmm1, %zmm0, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_psllv32hi:
+; X86:       # %bb.0:
+; X86-NEXT:    vpsllvw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0xd9]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x12,0xd1]
+; X86-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x12,0xc1]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_psllv32hi:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsllvw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x12,0xd9]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x12,0xd1]
+; X64-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x12,0xc1]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
   %res2 = call <32 x i16> @llvm.x86.avx512.mask.psllv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -1578,25 +1538,25 @@ define <32 x i16>@test_int_x86_avx512_ma
 declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
-; AVX512BW-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm3
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpermw %zmm0, %zmm1, %zmm3
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
-; AVX512BW-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0]
+; X86-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0]
+; X86-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X86-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8]
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0]
+; X64-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0]
+; X64-NEXT:    vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3]
+; X64-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
+; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1)
   %2 = bitcast i32 %x3 to <32 x i1>
   %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2
@@ -1610,49 +1570,44 @@ define <32 x i16>@test_int_x86_avx512_ma
 }
 
 define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_avx512_psll_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_psll_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_psll_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xf1,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_psll_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_psll_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_psll_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf1,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psll_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf1,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_psll_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_psll_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_psll_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_psll_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xf1,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psll_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xf1,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
@@ -1662,49 +1617,44 @@ declare <32 x i16> @llvm.x86.avx512.psll
 
 
 define <32 x i16> @test_x86_avx512_pslli_w_512(<32 x i16> %a0) {
-; AVX512BW-LABEL: test_x86_avx512_pslli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_pslli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsllw $7, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_pslli_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllw $7, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xf0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_pslli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_pslli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_pslli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsllw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_pslli_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_pslli_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xf0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_pslli_w_512(<32 x i16> %a0, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_pslli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_pslli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsllw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_pslli_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsllw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xf0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_pslli_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsllw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xf0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
@@ -1714,49 +1664,44 @@ declare <32 x i16> @llvm.x86.avx512.psll
 
 
 define <32 x i16> @test_x86_avx512_psra_w_512(<32 x i16> %a0, <8 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_avx512_psra_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_psra_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_psra_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe1,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_psra_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_psra_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_psra_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe1,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psra_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe1,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_psra_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_psra_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_psra_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_psra_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe1,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psra_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe1,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
@@ -1766,49 +1711,44 @@ declare <32 x i16> @llvm.x86.avx512.psra
 
 
 define <32 x i16> @test_x86_avx512_psrai_w_512(<32 x i16> %a0) {
-; AVX512BW-LABEL: test_x86_avx512_psrai_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsraw $7, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_psrai_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsraw $7, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_psrai_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsraw $7, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xe0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_psrai_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsraw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_psrai_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsraw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_psrai_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsraw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrai_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsraw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xe0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_psrai_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_psrai_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsraw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_psrai_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsraw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_psrai_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsraw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xe0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrai_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsraw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xe0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
@@ -1818,49 +1758,44 @@ declare <32 x i16> @llvm.x86.avx512.psra
 
 
 define <32 x i16> @test_x86_avx512_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1) {
-; AVX512BW-LABEL: test_x86_avx512_psrl_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_psrl_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_psrl_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xd1,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_psrl_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_psrl_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_psrl_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd1,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrl_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xd1,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_psrl_w_512(<32 x i16> %a0, <8 x i16> %a1, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_psrl_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_psrl_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_psrl_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd1,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrl_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xd1,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
@@ -1870,49 +1805,44 @@ declare <32 x i16> @llvm.x86.avx512.psrl
 
 
 define <32 x i16> @test_x86_avx512_psrli_w_512(<32 x i16> %a0) {
-; AVX512BW-LABEL: test_x86_avx512_psrli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_psrli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    vpsrlw $7, %zmm0, %zmm0
-; AVX512BW-32-NEXT:    retl
+; CHECK-LABEL: test_x86_avx512_psrli_w_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrlw $7, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x71,0xd0,0x07]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   ret <32 x i16> %res
 }
 define <32 x i16> @test_x86_avx512_mask_psrli_w_512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_mask_psrli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_mask_psrli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsrlw $7, %zmm0, %zmm1 {%k1}
-; AVX512BW-32-NEXT:    vmovdqa64 %zmm1, %zmm0
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_mask_psrli_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x07]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_mask_psrli_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlw $7, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x75,0x49,0x71,0xd0,0x07]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> %passthru
   ret <32 x i16> %res2
 }
 define <32 x i16> @test_x86_avx512_maskz_psrli_w_512(<32 x i16> %a0, i32 %mask) {
-; AVX512BW-LABEL: test_x86_avx512_maskz_psrli_w_512:
-; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    kmovd %edi, %k1
-; AVX512BW-NEXT:    vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
-;
-; AVX512BW-32-LABEL: test_x86_avx512_maskz_psrli_w_512:
-; AVX512BW-32:       # %bb.0:
-; AVX512BW-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512BW-32-NEXT:    vpsrlw $7, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-32-NEXT:    retl
+; X86-LABEL: test_x86_avx512_maskz_psrli_w_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsrlw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x07]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_x86_avx512_maskz_psrli_w_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsrlw $7, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0x71,0xd0,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 7) ; <<32 x i16>> [#uses=1]
   %mask.cast = bitcast i32 %mask to <32 x i1>
   %res2 = select <32 x i1> %mask.cast, <32 x i16> %res, <32 x i16> zeroinitializer
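
As an aside on the pattern used throughout this cleanup: the unified RUN lines give both targets a shared CHECK prefix plus a target-specific X86/X64 prefix, so update_llc_test_checks.py can emit a single CHECK block where the 32-bit and 64-bit codegen agree and split into X86/X64 blocks only where they diverge. Below is a minimal sketch of such a test; the function name @example_add is hypothetical and the CHECK lines are hand-written for illustration, not regenerated by the script:

; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64

define <32 x i16> @example_add(<32 x i16> %a, <32 x i16> %b) {
; Assuming both targets pass %a/%b in %zmm0/%zmm1 and return in %zmm0, the
; body is identical on both and folds under CHECK; only the return mnemonic
; differs, which a regex absorbs (retl and retq share the 0xc3 encoding).
; CHECK-LABEL: example_add:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
  %r = add <32 x i16> %a, %b
  ret <32 x i16> %r
}

Lines that genuinely differ per target, such as mask loads from %edi versus the stack in the masked tests above, land under X64-NEXT/X86-NEXT instead of CHECK-NEXT, which is why the masked functions in this diff keep separate blocks while the unmasked ones collapse to one.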

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll?rev=333843&r1=333842&r2=333843&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll Sun Jun  3 07:56:04 2018
@@ -1,23 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
-; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64
 
 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
 
 define zeroext i16 @test_mm_test_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_test_epi8_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestmb %xmm0, %xmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_test_epi8_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestmb %xmm0, %xmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -27,13 +20,13 @@ entry:
 }
 
 define zeroext i16 @test_mm_mask_test_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestmb %xmm0, %xmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_test_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestmb %xmm0, %xmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_test_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -53,19 +46,12 @@ entry:
 }
 
 define i32 @test_mm256_test_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_test_epi8_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestmb %ymm0, %ymm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_test_epi8_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestmb %ymm0, %ymm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -75,13 +61,13 @@ entry:
 }
 
 define i32 @test_mm256_mask_test_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask_test_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestmb %ymm0, %ymm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_test_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestmb %ymm0, %ymm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_test_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -101,19 +87,12 @@ entry:
 }
 
 define zeroext i8 @test_mm_test_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_test_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestmw %xmm0, %xmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_test_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestmw %xmm0, %xmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -123,14 +102,14 @@ entry:
 }
 
 define zeroext i8 @test_mm_mask_test_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vptestmw %xmm0, %xmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_test_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vptestmw %xmm0, %xmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_test_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -150,21 +129,13 @@ entry:
 }
 
 define zeroext i16 @test_mm256_test_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_test_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestmw %ymm0, %ymm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_test_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestmw %ymm0, %ymm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -174,14 +145,14 @@ entry:
 }
 
 define zeroext i16 @test_mm256_mask_test_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask_test_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestmw %ymm0, %ymm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_test_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestmw %ymm0, %ymm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_test_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -202,19 +173,12 @@ entry:
 }
 
 define zeroext i16 @test_mm_testn_epi8_mask(<2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_testn_epi8_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestnmb %xmm0, %xmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_testn_epi8_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestnmb %xmm0, %xmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <16 x i8>
@@ -224,13 +188,13 @@ entry:
 }
 
 define zeroext i16 @test_mm_mask_testn_epi8_mask(i16 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestnmb %xmm0, %xmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_testn_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestnmb %xmm0, %xmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_testn_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -250,19 +214,12 @@ entry:
 }
 
 define i32 @test_mm256_testn_epi8_mask(<4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_testn_epi8_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestnmb %ymm0, %ymm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_testn_epi8_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestnmb %ymm0, %ymm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <32 x i8>
@@ -272,13 +229,13 @@ entry:
 }
 
 define i32 @test_mm256_mask_testn_epi8_mask(i32 %__U, <4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask_testn_epi8_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestnmb %ymm0, %ymm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_testn_epi8_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestnmb %ymm0, %ymm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_testn_epi8_mask:
 ; X64:       # %bb.0: # %entry
@@ -298,19 +255,12 @@ entry:
 }
 
 define zeroext i8 @test_mm_testn_epi16_mask(<2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_testn_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestnmw %xmm0, %xmm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzbl %al, %eax
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_testn_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestnmw %xmm0, %xmm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <2 x i64> %__B, %__A
   %0 = bitcast <2 x i64> %and.i.i to <8 x i16>
@@ -320,14 +270,14 @@ entry:
 }
 
 define zeroext i8 @test_mm_mask_testn_epi16_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vptestnmw %xmm0, %xmm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzbl %al, %eax
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_testn_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vptestnmw %xmm0, %xmm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_testn_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -347,21 +297,13 @@ entry:
 }
 
 define zeroext i16 @test_mm256_testn_epi16_mask(<4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_testn_epi16_mask:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vptestnmw %ymm0, %ymm1, %k0
-; X64-NEXT:    kmovd %k0, %eax
-; X64-NEXT:    movzwl %ax, %eax
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_testn_epi16_mask:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vptestnmw %ymm0, %ymm1, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %and.i.i = and <4 x i64> %__B, %__A
   %0 = bitcast <4 x i64> %and.i.i to <16 x i16>
@@ -371,14 +313,14 @@ entry:
 }
 
 define zeroext i16 @test_mm256_mask_testn_epi16_mask(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask_testn_epi16_mask:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vptestnmw %ymm0, %ymm1, %k0 {%k1}
-; X32-NEXT:    kmovd %k0, %eax
-; X32-NEXT:    movzwl %ax, %eax
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_testn_epi16_mask:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vptestnmw %ymm0, %ymm1, %k0 {%k1}
+; X86-NEXT:    kmovd %k0, %eax
+; X86-NEXT:    movzwl %ax, %eax
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_testn_epi16_mask:
 ; X64:       # %bb.0: # %entry
@@ -399,12 +341,12 @@ entry:
 }
 
 define <2 x i64> @test_mm_mask_set1_epi8(<2 x i64> %__O, i16 zeroext %__M, i8 signext %__A) local_unnamed_addr #0 {
-; X32-LABEL: test_mm_mask_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %eax, %xmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %eax, %xmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -422,12 +364,12 @@ entry:
 }
 
 define <2 x i64> @test_mm_maskz_set1_epi8(i16 zeroext %__M, i8 signext %__A)  {
-; X32-LABEL: test_mm_maskz_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %eax, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_maskz_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %eax, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -444,12 +386,12 @@ entry:
 }
 
 define <4 x i64> @test_mm256_mask_set1_epi8(<4 x i64> %__O, i32 %__M, i8 signext %__A){
-; X32-LABEL: test_mm256_mask_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %eax, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %eax, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -467,12 +409,12 @@ entry:
 }
 
 define <4 x i64> @test_mm256_maskz_set1_epi8(i32 %__M, i8 signext %__A)  {
-; X32-LABEL: test_mm256_maskz_set1_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %eax, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_set1_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %eax, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_set1_epi8:
 ; X64:       # %bb.0: # %entry
@@ -489,12 +431,12 @@ entry:
 }
 
 define <4 x i64> @test_mm256_mask_set1_epi16(<4 x i64> %__O, i16 zeroext %__M, i16 signext %__A)  {
-; X32-LABEL: test_mm256_mask_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %eax, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %eax, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -512,12 +454,12 @@ entry:
 }
 
 define <4 x i64> @test_mm256_maskz_set1_epi16(i16 zeroext %__M, i16 signext %__A) {
-; X32-LABEL: test_mm256_maskz_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %eax, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %eax, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -534,13 +476,13 @@ entry:
 }
 
 define <2 x i64> @test_mm_mask_set1_epi16(<2 x i64> %__O, i8 zeroext %__M, i16 signext %__A) {
-; X32-LABEL: test_mm_mask_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpbroadcastw %eax, %xmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    kmovd %ecx, %k1
+; X86-NEXT:    vpbroadcastw %eax, %xmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -558,13 +500,13 @@ entry:
 }
 
 define <2 x i64> @test_mm_maskz_set1_epi16(i8 zeroext %__M, i16 signext %__A) {
-; X32-LABEL: test_mm_maskz_set1_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %cl
-; X32-NEXT:    kmovd %ecx, %k1
-; X32-NEXT:    vpbroadcastw %eax, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_maskz_set1_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    kmovd %ecx, %k1
+; X86-NEXT:    vpbroadcastw %eax, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_set1_epi16:
 ; X64:       # %bb.0: # %entry
@@ -582,15 +524,10 @@ entry:
 
 
 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_broadcastb_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_broadcastb_epi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
   %res1 = bitcast <16 x i8> %res0 to <2 x i64>
@@ -598,11 +535,11 @@ define <2 x i64> @test_mm_broadcastb_epi
 }
 
 define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_mask_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %xmm1, %xmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %xmm1, %xmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -619,11 +556,11 @@ define <2 x i64> @test_mm_mask_broadcast
 }
 
 define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_maskz_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_maskz_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -639,15 +576,10 @@ define <2 x i64> @test_mm_maskz_broadcas
 }
 
 define <4 x i64> @test_mm256_broadcastb_epi8(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_broadcastb_epi8:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_broadcastb_epi8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <32 x i32> zeroinitializer
   %res1 = bitcast <32 x i8> %res0 to <4 x i64>
@@ -655,11 +587,11 @@ define <4 x i64> @test_mm256_broadcastb_
 }
 
 define <4 x i64> @test_mm256_mask_broadcastb_epi8(<4 x i64> %a0, i32 %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm256_mask_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %xmm1, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %xmm1, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -676,11 +608,11 @@ define <4 x i64> @test_mm256_mask_broadc
 }
 
 define <4 x i64> @test_mm256_maskz_broadcastb_epi8(i32 %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_maskz_broadcastb_epi8:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastb %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_broadcastb_epi8:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastb %xmm0, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_broadcastb_epi8:
 ; X64:       # %bb.0:
@@ -696,15 +628,10 @@ define <4 x i64> @test_mm256_maskz_broad
 }
 
 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_broadcastw_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_broadcastw_epi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
   %res1 = bitcast <8 x i16> %res0 to <2 x i64>
@@ -712,12 +639,12 @@ define <2 x i64> @test_mm_broadcastw_epi
 }
 
 define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm_mask_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpbroadcastw %xmm1, %xmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpbroadcastw %xmm1, %xmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -734,12 +661,12 @@ define <2 x i64> @test_mm_mask_broadcast
 }
 
 define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm_maskz_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_maskz_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -755,15 +682,10 @@ define <2 x i64> @test_mm_maskz_broadcas
 }
 
 define <4 x i64> @test_mm256_broadcastw_epi16(<2 x i64> %a0) {
-; X32-LABEL: test_mm256_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_broadcastw_epi16:
-; X64:       # %bb.0:
-; X64-NEXT:    vpbroadcastw %xmm0, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_broadcastw_epi16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
   %res0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <16 x i32> zeroinitializer
   %res1 = bitcast <16 x i16> %res0 to <4 x i64>
@@ -771,11 +693,11 @@ define <4 x i64> @test_mm256_broadcastw_
 }
 
 define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x i64> %a2) {
-; X32-LABEL: test_mm256_mask_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %xmm1, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %xmm1, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -792,11 +714,11 @@ define <4 x i64> @test_mm256_mask_broadc
 }
 
 define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
-; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
-; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_broadcastw_epi16:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_broadcastw_epi16:
 ; X64:       # %bb.0:
@@ -812,17 +734,11 @@ define <4 x i64> @test_mm256_maskz_broad
 }
 
 define <2 x i64> @test_mm256_cvtepi16_epi8(<4 x i64> %__A) {
-; X32-LABEL: test_mm256_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vpmovwb %ymm0, %xmm0
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_cvtepi16_epi8:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vpmovwb %ymm0, %xmm0
-; X64-NEXT:    vzeroupper
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_cvtepi16_epi8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpmovwb %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <4 x i64> %__A to <16 x i16>
   %conv.i = trunc <16 x i16> %0 to <16 x i8>
@@ -831,12 +747,12 @@ entry:
 }
 
 define <2 x i64> @test_mm256_mask_cvtepi16_epi8(<2 x i64> %__O, i16 zeroext %__M, <4 x i64> %__A) {
-; X32-LABEL: test_mm256_mask_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpmovwb %ymm1, %xmm0 {%k1}
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_cvtepi16_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmovwb %ymm1, %xmm0 {%k1}
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_cvtepi16_epi8:
 ; X64:       # %bb.0: # %entry
@@ -855,12 +771,12 @@ entry:
 }
 
 define <2 x i64> @test_mm256_maskz_cvtepi16_epi8(i16 zeroext %__M, <4 x i64> %__A) {
-; X32-LABEL: test_mm256_maskz_cvtepi16_epi8:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpmovwb %ymm0, %xmm0 {%k1} {z}
-; X32-NEXT:    vzeroupper
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_cvtepi16_epi8:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpmovwb %ymm0, %xmm0 {%k1} {z}
+; X86-NEXT:    vzeroupper
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_cvtepi16_epi8:
 ; X64:       # %bb.0: # %entry
@@ -878,13 +794,13 @@ entry:
 }
 
 define <2 x i64> @test_mm_mask2_permutex2var_epi16(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask2_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 {%k1}
-; X32-NEXT:    vmovdqa %xmm1, %xmm0
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask2_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 {%k1}
+; X86-NEXT:    vmovdqa %xmm1, %xmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask2_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -904,12 +820,12 @@ entry:
 }
 
 define <4 x i64> @test_mm256_mask2_permutex2var_epi16(<4 x i64> %__A, <4 x i64> %__I, i16 zeroext %__U, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask2_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1}
-; X32-NEXT:    vmovdqa %ymm1, %ymm0
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask2_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1}
+; X86-NEXT:    vmovdqa %ymm1, %ymm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask2_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -929,15 +845,10 @@ entry:
 }
 
 define <2 x i64> @test_mm_permutex2var_epi16(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
-; X32-LABEL: test_mm_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm_permutex2var_epi16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm_permutex2var_epi16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <2 x i64> %__A to <8 x i16>
   %1 = bitcast <2 x i64> %__I to <8 x i16>
@@ -948,12 +859,12 @@ entry:
 }
 
 define <2 x i64> @test_mm_mask_permutex2var_epi16(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) {
-; X32-LABEL: test_mm_mask_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_mask_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -972,12 +883,12 @@ entry:
 }
 
 define <2 x i64> @test_mm_maskz_permutex2var_epi16(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) {
-; X32-LABEL: test_mm_maskz_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm_maskz_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    kmovd %eax, %k1
+; X86-NEXT:    vpermt2w %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -996,15 +907,10 @@ entry:
 }
 
 define <4 x i64> @test_mm256_permutex2var_epi16(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
-; X32-NEXT:    retl
-;
-; X64-LABEL: test_mm256_permutex2var_epi16:
-; X64:       # %bb.0: # %entry
-; X64-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
-; X64-NEXT:    retq
+; CHECK-LABEL: test_mm256_permutex2var_epi16:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %0 = bitcast <4 x i64> %__A to <16 x i16>
   %1 = bitcast <4 x i64> %__I to <16 x i16>
@@ -1015,11 +921,11 @@ entry:
 }
 
 define <4 x i64> @test_mm256_mask_permutex2var_epi16(<4 x i64> %__A, i16 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_mask_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0 {%k1}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_mask_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0 {%k1}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
@@ -1038,11 +944,11 @@ entry:
 }
 
 define <4 x i64> @test_mm256_maskz_permutex2var_epi16(i16 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) {
-; X32-LABEL: test_mm256_maskz_permutex2var_epi16:
-; X32:       # %bb.0: # %entry
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
-; X32-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0 {%k1} {z}
-; X32-NEXT:    retl
+; X86-LABEL: test_mm256_maskz_permutex2var_epi16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT:    vpermt2w %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_permutex2var_epi16:
 ; X64:       # %bb.0: # %entry
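
For reference, the consolidation pattern used throughout this patch boils down
to the following minimal sketch (the test name, triples and -mattr flags here
are illustrative, not taken from the patch): both RUN lines feed the shared
CHECK prefix, and the only 32/64-bit divergence in the unmasked cases is the
return instruction, which a FileCheck regex absorbs:

; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

define <2 x i64> @sketch_broadcastb(<2 x i64> %a0) {
; Broadcast byte 0 to all lanes; codegen is identical on both targets
; except for retl vs. retq, matched by the ret{{[l|q]}} pattern.
; CHECK-LABEL: sketch_broadcastb:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i8> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

The masked variants still need separate X86/X64 blocks because the mask
argument arrives on the stack in the 32-bit calling convention (loaded with
movb/kmovw/kmovd from {{[0-9]+}}(%esp)) but in a GPR on 64-bit.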