[llvm] r333549 - [X86][AVX512] Replace -cpu=knl with -mattr=+avx512f for avx512-intrinsics tests

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed May 30 07:36:41 PDT 2018


Author: rksimon
Date: Wed May 30 07:36:41 2018
New Revision: 333549

URL: http://llvm.org/viewvc/llvm-project?rev=333549&view=rev
Log:
[X86][AVX512] Replace -cpu=knl with -mattr=+avx512f for avx512-intrinsics tests

It was noticed on D47377 that these tests were being unnecessarily affected by scheduler changes.

This adds vzeroupper at the end of some tests as we lose the 'FeatureFastPartialYMMorZMMWrite' feature from KNL, since Skylake+ don't support this its probably better.

Modified:
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll?rev=333549&r1=333548&r2=333549&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll Wed May 30 07:36:41 2018
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
 
 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 
@@ -22,8 +22,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm1
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
     %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1)
@@ -42,8 +42,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
    %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1)
@@ -106,9 +106,9 @@ define <16 x i32>@test_int_x86_avx512_pb
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddd %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1)
   %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask)
@@ -126,9 +126,9 @@ define <8 x i64>@test_int_x86_avx512_pbr
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1)
   %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask)
@@ -227,8 +227,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4]
 ; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4]
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -244,6 +244,7 @@ define void @test_store1(<16 x float> %d
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovups %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
   call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
@@ -258,6 +259,7 @@ define void @test_store2(<8 x double> %d
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovupd %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
   call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
@@ -272,6 +274,7 @@ define void @test_mask_store_aligned_ps(
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovaps %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
   call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
@@ -286,6 +289,7 @@ define void @test_mask_store_aligned_pd(
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovapd %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
   call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
@@ -300,6 +304,7 @@ define void at test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
   call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
@@ -314,6 +319,7 @@ define void at test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
   call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
@@ -328,6 +334,7 @@ define void at test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
   call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
@@ -342,6 +349,7 @@ define void at test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
   call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
@@ -543,8 +551,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
 ; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12]
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 	%res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
 	%res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -560,6 +568,7 @@ define i16 @test_pcmpeq_d(<16 x i32> %a,
 ; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
@@ -572,6 +581,7 @@ define i16 @test_mask_pcmpeq_d(<16 x i32
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
@@ -585,6 +595,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
@@ -597,6 +608,7 @@ define i8 @test_mask_pcmpeq_q(<8 x i64>
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
@@ -610,6 +622,7 @@ define i16 @test_pcmpgt_d(<16 x i32> %a,
 ; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
@@ -622,6 +635,7 @@ define i16 @test_mask_pcmpgt_d(<16 x i32
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
@@ -635,6 +649,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <
 ; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
@@ -647,6 +662,7 @@ define i8 @test_mask_pcmpgt_q(<8 x i64>
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    andb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
@@ -726,9 +742,9 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -984,6 +1000,7 @@ define void at test_storent_q_512(<8 x i64>
 ; CHECK-LABEL: test_storent_q_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data)
   ret void
@@ -995,6 +1012,7 @@ define void @test_storent_pd_512(<8 x do
 ; CHECK-LABEL: test_storent_pd_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data)
   ret void
@@ -1006,6 +1024,7 @@ define void @test_storent_ps_512(<16 x f
 ; CHECK-LABEL: test_storent_ps_512:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    vmovntps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.storent.ps.512(i8* %ptr, <16 x float> %data)
   ret void
@@ -1955,8 +1974,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -1975,8 +1994,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -1995,8 +2014,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2015,8 +2034,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2035,8 +2054,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2055,8 +2074,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2075,8 +2094,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpmovsxbq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2095,8 +2114,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2116,8 +2135,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
@@ -2137,8 +2156,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpmovsxwq %xmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
@@ -2156,8 +2175,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm2
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vpsrlq $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3)
@@ -2176,8 +2195,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm2
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vpsrld $4, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3)
@@ -2197,8 +2216,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsrad $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -2217,8 +2236,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsraq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -2237,8 +2256,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpslld $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpslld $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
@@ -2257,8 +2276,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vpsllq $3, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3)
@@ -3007,6 +3026,7 @@ define <4 x float> @test_mask_vextractf3
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask)
   ret <4 x float> %res
@@ -3031,6 +3051,7 @@ define <4 x i32> @test_maskz_vextracti32
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask)
   ret <4 x i32> %res
@@ -3077,9 +3098,9 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1)
@@ -3117,9 +3138,9 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
@@ -3164,6 +3185,7 @@ define <8 x i16> @test_cmp_d_512(<16 x i
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3208,6 +3230,7 @@ define <8 x i16> @test_mask_cmp_d_512(<1
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3254,6 +3277,7 @@ define <8 x i16> @test_ucmp_d_512(<16 x
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3298,6 +3322,7 @@ define <8 x i16> @test_mask_ucmp_d_512(<
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3344,6 +3369,7 @@ define <8 x i8> @test_cmp_q_512(<8 x i64
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    movl $255, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3388,6 +3414,7 @@ define <8 x i8> @test_mask_cmp_q_512(<8
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3434,6 +3461,7 @@ define <8 x i8> @test_ucmp_q_512(<8 x i6
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    movl $255, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3478,6 +3506,7 @@ define <8 x i8> @test_mask_ucmp_q_512(<8
 ; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
   %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
@@ -3579,8 +3608,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
 
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1)
@@ -3612,9 +3641,9 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
 ; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
 
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1)
@@ -3677,6 +3706,7 @@ define i8 @test_vptestmq(<8 x i64> %a0,
 ; CHECK-NEXT:    andb %al, %dil
 ; CHECK-NEXT:    addb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
   %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
@@ -3693,6 +3723,7 @@ define i16 @test_vptestmd(<16 x i32> %a0
 ; CHECK-NEXT:    andl %eax, %edi
 ; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
   %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m)
@@ -3711,6 +3742,7 @@ define i16 at test_int_x86_avx512_ptestnm_d
 ; CHECK-NEXT:    andl %eax, %edi
 ; CHECK-NEXT:    addl %edi, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2)
   %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1)
@@ -3728,6 +3760,7 @@ define i8 at test_int_x86_avx512_ptestnm_q_
 ; CHECK-NEXT:    andb %al, %dil
 ; CHECK-NEXT:    addb %dil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
@@ -3822,6 +3855,7 @@ define i32 @test_kortestz(<8 x i64> %A,
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    kortestw %k1, %k0
 ; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
@@ -3845,6 +3879,7 @@ define i32 @test_kortestc(<8 x i64> %A,
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    kortestw %k1, %k0
 ; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %A to <16 x i32>
@@ -3865,6 +3900,7 @@ define i16 @test_cmpps(<16 x float> %a,
 ; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
   ret i16 %res
@@ -3877,6 +3913,7 @@ define i8 @test_cmppd(<8 x double> %a, <
 ; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
   ret i8 %res
@@ -4181,8 +4218,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -4221,8 +4258,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=333549&r1=333548&r2=333549&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Wed May 30 07:36:41 2018
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
 
 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_rcp_ps_512:
@@ -651,6 +651,7 @@ define i16 @test_cmpps(<16 x float> %a,
 ; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i32 8)
   %1 = bitcast <16 x i1> %res to i16
@@ -664,6 +665,7 @@ define i8 @test_cmppd(<8 x double> %a, <
 ; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i32 4)
   %1 = bitcast <8 x i1> %res to i8
@@ -2164,8 +2166,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -2183,6 +2186,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmov.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2198,8 +2202,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -2217,6 +2222,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovsqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2232,8 +2238,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
@@ -2251,6 +2258,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovusqb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2266,8 +2274,9 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
+; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2285,6 +2294,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmov.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2300,8 +2310,9 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
+; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2319,6 +2330,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovsqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2334,8 +2346,9 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
+; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
     %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
@@ -2353,6 +2366,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovusqw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2367,9 +2381,9 @@ define <8 x i32>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
@@ -2387,6 +2401,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2401,9 +2416,9 @@ define <8 x i32>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
@@ -2421,6 +2436,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovsqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2435,9 +2451,9 @@ define <8 x i32>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
     %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
@@ -2455,6 +2471,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovusqd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 -1)
     call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64> %x1, i8 %x2)
@@ -2470,8 +2487,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
@@ -2489,6 +2507,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovdb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmov.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2504,8 +2523,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
@@ -2523,6 +2543,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovsdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmovs.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2538,8 +2559,9 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
+; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
     %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
@@ -2557,6 +2579,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovusdb %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmovus.db.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2571,9 +2594,9 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -2591,6 +2614,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovdw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmov.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2605,9 +2629,9 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -2625,6 +2649,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovsdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2639,9 +2664,9 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1
 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
     %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
     %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
@@ -2659,6 +2684,7 @@ define void @test_int_x86_avx512_mask_pm
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi)
 ; CHECK-NEXT:    vpmovusdw %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
     call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 -1)
     call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(i8* %ptr, <16 x i32> %x1, i16 %x2)
@@ -2936,9 +2962,9 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    orb %cl, %dl
 ; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    orb %dl, %al
+; CHECK-NEXT:    orb %cl, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 
@@ -2981,9 +3007,9 @@ define i8 at test_int_x86_avx512_mask_cmp_s
 ; CHECK-NEXT:    kmovw %k0, %esi
 ; CHECK-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andb %cl, %dl
 ; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    andb %dl, %al
+; CHECK-NEXT:    andb %cl, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
@@ -3388,12 +3414,12 @@ declare <16 x i32> @llvm.x86.avx512.mask
 define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
@@ -3408,12 +3434,12 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -3431,8 +3457,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vprold $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vprold $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -3451,8 +3477,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vprolq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
 ; CHECK-NEXT:    vprolq $3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -3496,8 +3522,8 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
   %2 = bitcast i8 %x3 to <8 x i1>
@@ -3544,8 +3570,8 @@ define <16 x i32>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
   %2 = bitcast i16 %x3 to <16 x i1>




More information about the llvm-commits mailing list