[llvm] r315175 - [X86][SKX] Adding the scheduling information for the SKX target.

Gadi Haber via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 8 05:52:54 PDT 2017


Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Sun Oct  8 05:52:54 2017
@@ -5,22 +5,22 @@
 define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
 ; CHECK-LABEL: test_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -29,12 +29,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -42,13 +42,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -57,12 +57,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -70,13 +70,13 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -85,12 +85,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
 ; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -98,22 +98,22 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
 ; CHECK-LABEL: test_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -122,12 +122,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
 ; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -135,9 +135,9 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   ret <16 x i16> %res
@@ -145,12 +145,12 @@ define <16 x i16> @test_16xi16_perm_mem_
 define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -160,12 +160,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
 ; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -175,12 +175,12 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -190,12 +190,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
 ; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -205,12 +205,12 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -220,12 +220,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
 ; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -235,9 +235,9 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   ret <16 x i16> %res
@@ -245,12 +245,12 @@ define <16 x i16> @test_16xi16_perm_mem_
 define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -260,12 +260,12 @@ define <16 x i16> @test_masked_16xi16_pe
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
 ; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -275,22 +275,22 @@ define <16 x i16> @test_masked_z_16xi16_
 define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
 ; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -299,12 +299,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
 ; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -312,13 +312,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -327,12 +327,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -340,13 +340,13 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -355,12 +355,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -368,22 +368,22 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
 ; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1} # sched: [6:2.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -392,12 +392,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
 ; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [6:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -405,9 +405,9 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   ret <32 x i16> %res
@@ -415,12 +415,12 @@ define <32 x i16> @test_32xi16_perm_mem_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -430,12 +430,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -445,12 +445,12 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -460,12 +460,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -475,12 +475,12 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -490,12 +490,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
 ; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -505,9 +505,9 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   ret <32 x i16> %res
@@ -515,12 +515,12 @@ define <32 x i16> @test_32xi16_perm_mem_
 define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -530,12 +530,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
 ; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [13:2.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -545,21 +545,21 @@ define <32 x i16> @test_masked_z_32xi16_
 define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
 ; CHECK-LABEL: test_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-53, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -568,11 +568,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-53, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -580,12 +580,12 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-89, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -594,11 +594,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,1,2,6,0,0,3] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-89, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -606,12 +606,12 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -620,11 +620,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,5,5,1,7,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -632,21 +632,21 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
 ; CHECK-LABEL: test_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $47, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -655,11 +655,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $47, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -667,9 +667,9 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   ret <8 x i32> %res
@@ -677,11 +677,11 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -691,11 +691,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -705,11 +705,11 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $89, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -719,11 +719,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,1,7,6,7,6,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $89, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -733,11 +733,11 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
 ; CHECK-NEXT:    movb $98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -747,11 +747,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,4,6,1,6,3,6,3] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
 ; CHECK-NEXT:    movb $98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -761,9 +761,9 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   ret <8 x i32> %res
@@ -771,11 +771,11 @@ define <8 x i32> @test_8xi32_perm_mem_ma
 define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -785,11 +785,11 @@ define <8 x i32> @test_masked_8xi32_perm
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -799,22 +799,22 @@ define <8 x i32> @test_masked_z_8xi32_pe
 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -823,12 +823,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -836,13 +836,13 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -851,12 +851,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -864,13 +864,13 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
 ; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -879,12 +879,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
 ; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -892,22 +892,22 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -916,12 +916,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -929,9 +929,9 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   ret <16 x i32> %res
@@ -939,12 +939,12 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
 ; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -954,12 +954,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
 ; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -969,12 +969,12 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -984,12 +984,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -999,12 +999,12 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
 ; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -1014,12 +1014,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
 ; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -1029,9 +1029,9 @@ define <16 x i32> @test_masked_z_16xi32_
 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   ret <16 x i32> %res
@@ -1039,12 +1039,12 @@ define <16 x i32> @test_16xi32_perm_mem_
 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -1054,12 +1054,12 @@ define <16 x i32> @test_masked_16xi32_pe
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -1070,7 +1070,7 @@ define <4 x i64> @test_4xi64_perm_mask0(
 ; CHECK-LABEL: test_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x i64> %res
 }
@@ -1078,10 +1078,10 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
@@ -1091,9 +1091,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -1102,10 +1102,10 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
@@ -1115,9 +1115,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -1126,10 +1126,10 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
@@ -1139,9 +1139,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -1150,7 +1150,7 @@ define <4 x i64> @test_4xi64_perm_mask3(
 ; CHECK-LABEL: test_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   ret <4 x i64> %res
 }
@@ -1158,10 +1158,10 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
@@ -1171,9 +1171,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -1181,8 +1181,8 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
 ; CHECK-LABEL: test_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   ret <4 x i64> %res
@@ -1191,9 +1191,9 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1204,9 +1204,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1217,9 +1217,9 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1230,9 +1230,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1243,9 +1243,9 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1256,9 +1256,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1268,8 +1268,8 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
 ; CHECK-LABEL: test_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x i64> %res
@@ -1278,9 +1278,9 @@ define <4 x i64> @test_masked_4xi64_perm
 ; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1291,9 +1291,9 @@ define <4 x i64> @test_masked_z_4xi64_pe
 ; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -1303,21 +1303,21 @@ define <4 x i64> @test_masked_z_4xi64_pe
 define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1326,11 +1326,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1339,10 +1339,10 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-122, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1352,9 +1352,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-122, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1362,12 +1362,12 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $17, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1376,11 +1376,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,3,3,5,4,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $17, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1388,8 +1388,8 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   ret <8 x i64> %res
 }
@@ -1397,10 +1397,10 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1410,9 +1410,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1420,12 +1420,12 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1434,11 +1434,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,3,1,1,7,4,0,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1447,10 +1447,10 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-67, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1460,9 +1460,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-67, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1470,21 +1470,21 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1493,11 +1493,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1506,10 +1506,10 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
   ret <8 x i64> %res
@@ -1519,9 +1519,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -1529,9 +1529,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   ret <8 x i64> %res
@@ -1539,11 +1539,11 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1553,11 +1553,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1568,9 +1568,9 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $125, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1581,9 +1581,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $125, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1593,11 +1593,11 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-77, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1607,11 +1607,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,1,4,1,1,5,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-77, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1621,8 +1621,8 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   ret <8 x i64> %res
@@ -1631,9 +1631,9 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $55, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1644,9 +1644,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $55, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1656,11 +1656,11 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $68, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1670,11 +1670,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,0,7,0,3,5,0,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $68, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1685,9 +1685,9 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1698,9 +1698,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1710,9 +1710,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_8xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   ret <8 x i64> %res
@@ -1720,11 +1720,11 @@ define <8 x i64> @test_8xi64_perm_mem_ma
 define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1734,11 +1734,11 @@ define <8 x i64> @test_masked_8xi64_perm
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
 ; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1749,9 +1749,9 @@ define <8 x i64> @test_masked_8xi64_perm
 ; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1762,9 +1762,9 @@ define <8 x i64> @test_masked_z_8xi64_pe
 ; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -1774,21 +1774,21 @@ define <8 x i64> @test_masked_z_8xi64_pe
 define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
 ; CHECK-LABEL: test_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
@@ -1797,11 +1797,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -1809,12 +1809,12 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-34, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
@@ -1823,11 +1823,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,1,0,6,0,5,1] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-34, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -1835,12 +1835,12 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-18, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
@@ -1849,11 +1849,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,5,5,4,6,0,5] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-18, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -1861,21 +1861,21 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
 ; CHECK-LABEL: test_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec2
   ret <8 x float> %res
@@ -1884,11 +1884,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -1896,9 +1896,9 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
 ; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   ret <8 x float> %res
@@ -1906,11 +1906,11 @@ define <8 x float> @test_8xfloat_perm_me
 define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $61, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
@@ -1920,11 +1920,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $61, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -1934,11 +1934,11 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-124, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
@@ -1948,11 +1948,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1,3,7,4,0,6,6,6] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-124, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -1962,11 +1962,11 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-84, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2
@@ -1976,11 +1976,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [4,5,1,5,6,6,2,4] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
 ; CHECK-NEXT:    movb $-84, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -1990,9 +1990,9 @@ define <8 x float> @test_masked_z_8xfloa
 define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
 ; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   ret <8 x float> %res
@@ -2000,11 +2000,11 @@ define <8 x float> @test_8xfloat_perm_me
 define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2) {
 ; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $60, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2
@@ -2014,11 +2014,11 @@ define <8 x float> @test_masked_8xfloat_
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
 ; CHECK-NEXT:    movb $60, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -2028,22 +2028,22 @@ define <8 x float> @test_masked_z_8xfloa
 define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
 ; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
@@ -2052,12 +2052,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
 ; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -2065,13 +2065,13 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
@@ -2080,12 +2080,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -2093,13 +2093,13 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
@@ -2108,12 +2108,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -2121,22 +2121,22 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
 ; CHECK-LABEL: test_16xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2
   ret <16 x float> %res
@@ -2145,12 +2145,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
 ; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -2158,9 +2158,9 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   ret <16 x float> %res
@@ -2168,12 +2168,12 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
@@ -2183,12 +2183,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
 ; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -2198,12 +2198,12 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2
@@ -2213,12 +2213,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -2228,12 +2228,12 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
 ; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
@@ -2243,12 +2243,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
 ; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -2258,9 +2258,9 @@ define <16 x float> @test_masked_z_16xfl
 define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   ret <16 x float> %res
@@ -2268,12 +2268,12 @@ define <16 x float> @test_16xfloat_perm_
 define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2) {
 ; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2
@@ -2283,12 +2283,12 @@ define <16 x float> @test_masked_16xfloa
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
 ; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -2299,7 +2299,7 @@ define <4 x double> @test_4xdouble_perm_
 ; CHECK-LABEL: test_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   ret <4 x double> %res
 }
@@ -2307,10 +2307,10 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
@@ -2320,9 +2320,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -2331,10 +2331,10 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
@@ -2344,9 +2344,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -2355,10 +2355,10 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
@@ -2368,9 +2368,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -2379,7 +2379,7 @@ define <4 x double> @test_4xdouble_perm_
 ; CHECK-LABEL: test_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   ret <4 x double> %res
 }
@@ -2387,10 +2387,10 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
   ret <4 x double> %res
@@ -2400,9 +2400,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -2410,8 +2410,8 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
 ; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x double> %res
@@ -2420,9 +2420,9 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
@@ -2433,9 +2433,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -2446,9 +2446,9 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
@@ -2459,9 +2459,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -2472,9 +2472,9 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
@@ -2485,9 +2485,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -2497,8 +2497,8 @@ define <4 x double> @test_masked_z_4xdou
 define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
 ; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   ret <4 x double> %res
@@ -2507,9 +2507,9 @@ define <4 x double> @test_masked_4xdoubl
 ; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
@@ -2520,9 +2520,9 @@ define <4 x double> @test_masked_z_4xdou
 ; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -2532,21 +2532,21 @@ define <4 x double> @test_masked_z_4xdou
 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2555,11 +2555,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2568,10 +2568,10 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2581,9 +2581,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2591,12 +2591,12 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
 ; CHECK-NEXT:    movb $49, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2605,11 +2605,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [7,5,5,5,3,5,1,7] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
 ; CHECK-NEXT:    movb $49, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2617,8 +2617,8 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   ret <8 x double> %res
 }
@@ -2626,10 +2626,10 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-57, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2639,9 +2639,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-57, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2649,12 +2649,12 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-54, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2663,11 +2663,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [3,5,3,4,6,5,7,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-54, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2676,10 +2676,10 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2689,9 +2689,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2699,21 +2699,21 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-LABEL: test_8xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-65, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1} # sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2722,11 +2722,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-65, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2735,10 +2735,10 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $40, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
   ret <8 x double> %res
@@ -2748,9 +2748,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $40, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -2758,9 +2758,9 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   ret <8 x double> %res
@@ -2768,11 +2768,11 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $99, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
@@ -2782,11 +2782,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
 ; CHECK-NEXT:    movb $99, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2797,9 +2797,9 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-32, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
@@ -2810,9 +2810,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-32, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2822,11 +2822,11 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
@@ -2836,11 +2836,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [6,7,2,7,7,6,2,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2850,8 +2850,8 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   ret <8 x double> %res
@@ -2860,9 +2860,9 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $119, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
@@ -2873,9 +2873,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $119, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2885,11 +2885,11 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-45, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
@@ -2899,11 +2899,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [1,1,3,5,6,0,6,0] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-45, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2914,9 +2914,9 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
@@ -2927,9 +2927,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2939,9 +2939,9 @@ define <8 x double> @test_masked_z_8xdou
 define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
 ; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   ret <8 x double> %res
@@ -2949,11 +2949,11 @@ define <8 x double> @test_8xdouble_perm_
 define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2) {
 ; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-75, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
@@ -2963,11 +2963,11 @@ define <8 x double> @test_masked_8xdoubl
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
 ; CHECK-NEXT:    movb $-75, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -2978,9 +2978,9 @@ define <8 x double> @test_masked_8xdoubl
 ; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $84, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
@@ -2991,9 +2991,9 @@ define <8 x double> @test_masked_z_8xdou
 ; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $84, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -3003,8 +3003,8 @@ define <8 x double> @test_masked_z_8xdou
 define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
 ; CHECK-LABEL: test_16xi8_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   ret <16 x i8> %res
 }
@@ -3013,10 +3013,10 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
   ret <16 x i8> %res
@@ -3027,9 +3027,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
@@ -3039,10 +3039,10 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
   ret <16 x i8> %res
@@ -3053,9 +3053,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
@@ -3065,10 +3065,10 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
   ret <16 x i8> %res
@@ -3079,9 +3079,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
@@ -3089,8 +3089,8 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
 ; CHECK-LABEL: test_16xi8_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   ret <16 x i8> %res
 }
@@ -3099,10 +3099,10 @@ define <16 x i8> @test_masked_16xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
   ret <16 x i8> %res
@@ -3113,9 +3113,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
@@ -3123,9 +3123,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_16xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   ret <16 x i8> %res
@@ -3133,12 +3133,12 @@ define <16 x i8> @test_16xi8_perm_mem_ma
 define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3148,12 +3148,12 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
@@ -3163,12 +3163,12 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3178,12 +3178,12 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
@@ -3193,12 +3193,12 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3208,12 +3208,12 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
@@ -3223,9 +3223,9 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_16xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   ret <16 x i8> %res
@@ -3233,12 +3233,12 @@ define <16 x i8> @test_16xi8_perm_mem_ma
 define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3248,12 +3248,12 @@ define <16 x i8> @test_masked_16xi8_perm
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
 ; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
@@ -3263,8 +3263,8 @@ define <16 x i8> @test_masked_z_16xi8_pe
 define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
 ; CHECK-LABEL: test_32xi8_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   ret <32 x i8> %res
 }
@@ -3273,10 +3273,10 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
   ret <32 x i8> %res
@@ -3287,9 +3287,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
@@ -3299,10 +3299,10 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2
   ret <32 x i8> %res
@@ -3313,9 +3313,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
@@ -3325,10 +3325,10 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
   ret <32 x i8> %res
@@ -3339,9 +3339,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
@@ -3349,8 +3349,8 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
 ; CHECK-LABEL: test_32xi8_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   ret <32 x i8> %res
 }
@@ -3359,10 +3359,10 @@ define <32 x i8> @test_masked_32xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
   ret <32 x i8> %res
@@ -3373,9 +3373,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
@@ -3383,9 +3383,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_32xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   ret <32 x i8> %res
@@ -3393,12 +3393,12 @@ define <32 x i8> @test_32xi8_perm_mem_ma
 define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3408,12 +3408,12 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
@@ -3423,12 +3423,12 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3438,12 +3438,12 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer
@@ -3453,12 +3453,12 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3468,12 +3468,12 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer
@@ -3483,9 +3483,9 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_32xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   ret <32 x i8> %res
@@ -3493,12 +3493,12 @@ define <32 x i8> @test_32xi8_perm_mem_ma
 define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3508,12 +3508,12 @@ define <32 x i8> @test_masked_32xi8_perm
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
 ; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
@@ -3523,8 +3523,8 @@ define <32 x i8> @test_masked_z_32xi8_pe
 define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
 ; CHECK-LABEL: test_64xi8_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   ret <64 x i8> %res
 }
@@ -3533,10 +3533,10 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $3680399704764602881, %rax # imm = 0x3313680829F25A01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
   ret <64 x i8> %res
@@ -3547,9 +3547,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $3680399704764602881, %rax # imm = 0x3313680829F25A01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
@@ -3559,10 +3559,10 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $3029806472256067585, %rax # imm = 0x2A0C08EF15009801
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
   ret <64 x i8> %res
@@ -3573,9 +3573,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $3029806472256067585, %rax # imm = 0x2A0C08EF15009801
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
@@ -3585,10 +3585,10 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $1110016799796225, %rax # imm = 0x3F18DED0BEC01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
   ret <64 x i8> %res
@@ -3599,9 +3599,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $1110016799796225, %rax # imm = 0x3F18DED0BEC01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
@@ -3609,8 +3609,8 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
 ; CHECK-LABEL: test_64xi8_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   ret <64 x i8> %res
 }
@@ -3619,10 +3619,10 @@ define <64 x i8> @test_masked_64xi8_perm
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $839183534234450945, %rax # imm = 0xBA560FA6B66BC01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
   ret <64 x i8> %res
@@ -3633,9 +3633,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movabsq $839183534234450945, %rax # imm = 0xBA560FA6B66BC01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
@@ -3643,9 +3643,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_64xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   ret <64 x i8> %res
@@ -3653,12 +3653,12 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3164984076108002305, %rax # imm = 0x2BEC483F982F7401
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3668,12 +3668,12 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3164984076108002305, %rax # imm = 0x2BEC483F982F7401
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
@@ -3683,12 +3683,12 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3421658227176024577, %rax # imm = 0x2F7C2C07659EAA01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3698,12 +3698,12 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3421658227176024577, %rax # imm = 0x2F7C2C07659EAA01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
@@ -3713,12 +3713,12 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3085252902658394625, %rax # imm = 0x2AD1052B29324A01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3728,12 +3728,12 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $3085252902658394625, %rax # imm = 0x2AD1052B29324A01
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
@@ -3743,9 +3743,9 @@ define <64 x i8> @test_masked_z_64xi8_pe
 define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_64xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   ret <64 x i8> %res
@@ -3753,12 +3753,12 @@ define <64 x i8> @test_64xi8_perm_mem_ma
 define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2) {
 ; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $29622951609754113, %rax # imm = 0x693DEAE3E5E201
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3768,12 +3768,12 @@ define <64 x i8> @test_masked_64xi8_perm
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
 ; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
 ; CHECK-NEXT:    movabsq $29622951609754113, %rax # imm = 0x693DEAE3E5E201
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovq %rax, %k1
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovq %rax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
@@ -3784,7 +3784,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 ; CHECK-LABEL: test_8xi16_perm_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   ret <8 x i16> %res
 }
@@ -3792,10 +3792,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3805,9 +3805,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3816,10 +3816,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $43, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3829,9 +3829,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $43, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3840,10 +3840,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $20, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3853,9 +3853,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $20, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3864,7 +3864,7 @@ define <8 x i16> @test_8xi16_perm_low_ma
 ; CHECK-LABEL: test_8xi16_perm_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -3872,10 +3872,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-20, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3885,9 +3885,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-20, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3896,10 +3896,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3909,9 +3909,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3920,10 +3920,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3933,9 +3933,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3944,7 +3944,7 @@ define <8 x i16> @test_8xi16_perm_high_m
 ; CHECK-LABEL: test_8xi16_perm_high_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   ret <8 x i16> %res
 }
@@ -3952,10 +3952,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $117, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3965,9 +3965,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $117, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3976,10 +3976,10 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $39, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
   ret <8 x i16> %res
@@ -3989,9 +3989,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $39, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
@@ -3999,8 +3999,8 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
 ; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   ret <8 x i16> %res
@@ -4009,9 +4009,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-83, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4022,9 +4022,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-83, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4035,9 +4035,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4048,9 +4048,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4061,9 +4061,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4074,9 +4074,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4086,8 +4086,8 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
 ; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
@@ -4096,9 +4096,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4109,9 +4109,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $74, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4122,9 +4122,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4135,9 +4135,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4148,9 +4148,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $53, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4161,9 +4161,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $53, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4173,8 +4173,8 @@ define <8 x i16> @test_masked_z_8xi16_pe
 define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
 ; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   ret <8 x i16> %res
@@ -4183,9 +4183,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-121, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4196,9 +4196,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-121, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4209,9 +4209,9 @@ define <8 x i16> @test_masked_8xi16_perm
 ; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $87, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4222,9 +4222,9 @@ define <8 x i16> @test_masked_z_8xi16_pe
 ; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $87, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
@@ -4235,7 +4235,7 @@ define <16 x i16> @test_16xi16_perm_high
 ; CHECK-LABEL: test_16xi16_perm_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   ret <16 x i16> %res
 }
@@ -4244,10 +4244,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3495, %ax # imm = 0xF259
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4258,9 +4258,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3495, %ax # imm = 0xF259
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4270,10 +4270,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11903, %ax # imm = 0xD181
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4284,9 +4284,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11903, %ax # imm = 0xD181
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4296,10 +4296,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-14510, %ax # imm = 0xC752
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4310,9 +4310,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-14510, %ax # imm = 0xC752
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4321,7 +4321,7 @@ define <16 x i16> @test_16xi16_perm_low_
 ; CHECK-LABEL: test_16xi16_perm_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %res
 }
@@ -4330,10 +4330,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-16563, %ax # imm = 0xBF4D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4344,9 +4344,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-16563, %ax # imm = 0xBF4D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4356,10 +4356,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12298, %ax # imm = 0x300A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4370,9 +4370,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $12298, %ax # imm = 0x300A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4382,10 +4382,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29565, %ax # imm = 0x8C83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4396,9 +4396,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29565, %ax # imm = 0x8C83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4407,7 +4407,7 @@ define <16 x i16> @test_16xi16_perm_high
 ; CHECK-LABEL: test_16xi16_perm_high_mask6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   ret <16 x i16> %res
 }
@@ -4416,10 +4416,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27779, %ax # imm = 0x6C83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4430,9 +4430,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27779, %ax # imm = 0x6C83
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4442,10 +4442,10 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3292, %ax # imm = 0xF324
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
   ret <16 x i16> %res
@@ -4456,9 +4456,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3292, %ax # imm = 0xF324
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
@@ -4466,8 +4466,8 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   ret <16 x i16> %res
@@ -4477,9 +4477,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12838, %ax # imm = 0xCDDA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4491,9 +4491,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12838, %ax # imm = 0xCDDA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4505,9 +4505,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14962, %ax # imm = 0x3A72
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4519,9 +4519,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14962, %ax # imm = 0x3A72
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4533,9 +4533,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $1029, %ax # imm = 0x405
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4547,9 +4547,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $1029, %ax # imm = 0x405
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4559,8 +4559,8 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %res
@@ -4570,9 +4570,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30862, %ax # imm = 0x8772
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4584,9 +4584,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30862, %ax # imm = 0x8772
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4598,9 +4598,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3845, %ax # imm = 0xF0FB
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4612,9 +4612,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3845, %ax # imm = 0xF0FB
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4626,9 +4626,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20955, %ax # imm = 0xAE25
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4640,9 +4640,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20955, %ax # imm = 0xAE25
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4652,8 +4652,8 @@ define <16 x i16> @test_masked_z_16xi16_
 define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
 ; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   ret <16 x i16> %res
@@ -4663,9 +4663,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24190, %ax # imm = 0xA182
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4677,9 +4677,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24190, %ax # imm = 0xA182
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4691,9 +4691,9 @@ define <16 x i16> @test_masked_16xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24392, %ax # imm = 0xA0B8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4705,9 +4705,9 @@ define <16 x i16> @test_masked_z_16xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-24392, %ax # imm = 0xA0B8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
@@ -4717,8 +4717,8 @@ define <16 x i16> @test_masked_z_16xi16_
 define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_perm_high_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   ret <32 x i16> %res
 }
@@ -4727,10 +4727,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1671867126, %eax # imm = 0x63A6AAF6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4741,9 +4741,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1671867126, %eax # imm = 0x63A6AAF6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4753,10 +4753,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-514766311, %eax # imm = 0xE1514A19
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4767,9 +4767,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-514766311, %eax # imm = 0xE1514A19
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4779,10 +4779,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $165000787, %eax # imm = 0x9D5B653
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4793,9 +4793,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $165000787, %eax # imm = 0x9D5B653
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4803,8 +4803,8 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_perm_low_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i16> %res
 }
@@ -4813,10 +4813,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1998504075, %eax # imm = 0x771EC08B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4827,9 +4827,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1998504075, %eax # imm = 0x771EC08B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4839,10 +4839,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-730778639, %eax # imm = 0xD47133F1
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4853,9 +4853,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-730778639, %eax # imm = 0xD47133F1
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4865,10 +4865,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $544659762, %eax # imm = 0x2076D932
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4879,9 +4879,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $544659762, %eax # imm = 0x2076D932
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4889,8 +4889,8 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
 ; CHECK-LABEL: test_32xi16_perm_high_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   ret <32 x i16> %res
 }
@@ -4899,10 +4899,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1243446456, %eax # imm = 0xB5E28348
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %res = select <32 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4913,9 +4913,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1243446456, %eax # imm = 0xB5E28348
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %res = select <32 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4925,10 +4925,10 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1409246810, %eax # imm = 0x53FF665A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
   ret <32 x i16> %res
@@ -4939,9 +4939,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1409246810, %eax # imm = 0x53FF665A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
@@ -4949,8 +4949,8 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   ret <32 x i16> %res
@@ -4960,9 +4960,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1911488810, %eax # imm = 0x8E10FED6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -4974,9 +4974,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1911488810, %eax # imm = 0x8E10FED6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -4988,9 +4988,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1098876619, %eax # imm = 0xBE807935
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5002,9 +5002,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1098876619, %eax # imm = 0xBE807935
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5016,9 +5016,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1583892148, %eax # imm = 0xA197B94C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5030,9 +5030,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1583892148, %eax # imm = 0xA197B94C
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5042,8 +5042,8 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i16> %res
@@ -5053,9 +5053,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-216128444, %eax # imm = 0xF31E2444
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5067,9 +5067,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-216128444, %eax # imm = 0xF31E2444
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5081,9 +5081,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1480468153, %eax # imm = 0x583E26B9
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5095,9 +5095,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $1480468153, %eax # imm = 0x583E26B9
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5107,12 +5107,12 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2) {
 ; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; CHECK-NEXT:    movl $-1778617447, %eax # imm = 0x95FC7399
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5122,12 +5122,12 @@ define <32 x i16> @test_masked_32xi16_pe
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
 ; CHECK-NEXT:    movl $-1778617447, %eax # imm = 0x95FC7399
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5137,8 +5137,8 @@ define <32 x i16> @test_masked_z_32xi16_
 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
 ; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   ret <32 x i16> %res
@@ -5148,9 +5148,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $355619267, %eax # imm = 0x153251C3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5162,9 +5162,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $355619267, %eax # imm = 0x153251C3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5176,9 +5176,9 @@ define <32 x i16> @test_masked_32xi16_pe
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1890659259, %eax # imm = 0x8F4ED445
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5190,9 +5190,9 @@ define <32 x i16> @test_masked_z_32xi16_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl $-1890659259, %eax # imm = 0x8F4ED445
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
@@ -5203,7 +5203,7 @@ define <4 x i32> @test_4xi32_perm_mask0(
 ; CHECK-LABEL: test_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   ret <4 x i32> %res
 }
@@ -5211,10 +5211,10 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
@@ -5224,9 +5224,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
@@ -5235,10 +5235,10 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
@@ -5248,9 +5248,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
@@ -5259,10 +5259,10 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
@@ -5272,9 +5272,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
@@ -5283,7 +5283,7 @@ define <4 x i32> @test_4xi32_perm_mask3(
 ; CHECK-LABEL: test_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   ret <4 x i32> %res
 }
@@ -5291,10 +5291,10 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
   ret <4 x i32> %res
@@ -5304,9 +5304,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
@@ -5314,8 +5314,8 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
 ; CHECK-LABEL: test_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   ret <4 x i32> %res
@@ -5324,9 +5324,9 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5337,9 +5337,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
@@ -5350,9 +5350,9 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5363,9 +5363,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
@@ -5376,9 +5376,9 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5389,9 +5389,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer
@@ -5401,8 +5401,8 @@ define <4 x i32> @test_masked_z_4xi32_pe
 define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
 ; CHECK-LABEL: test_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   ret <4 x i32> %res
@@ -5411,9 +5411,9 @@ define <4 x i32> @test_masked_4xi32_perm
 ; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5424,9 +5424,9 @@ define <4 x i32> @test_masked_z_4xi32_pe
 ; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer
@@ -5437,7 +5437,7 @@ define <8 x i32> @test2_8xi32_perm_mask0
 ; CHECK-LABEL: test2_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   ret <8 x i32> %res
 }
@@ -5445,10 +5445,10 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-99, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -5458,9 +5458,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-99, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -5469,10 +5469,10 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-90, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -5482,9 +5482,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-90, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -5493,10 +5493,10 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -5506,9 +5506,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -5517,7 +5517,7 @@ define <8 x i32> @test2_8xi32_perm_mask3
 ; CHECK-LABEL: test2_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   ret <8 x i32> %res
 }
@@ -5525,10 +5525,10 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
   ret <8 x i32> %res
@@ -5538,9 +5538,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -5548,8 +5548,8 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
 ; CHECK-LABEL: test2_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   ret <8 x i32> %res
@@ -5558,9 +5558,9 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5571,9 +5571,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -5584,9 +5584,9 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-97, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5597,9 +5597,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-97, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -5610,9 +5610,9 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $73, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5623,9 +5623,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $73, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -5635,8 +5635,8 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
 ; CHECK-LABEL: test2_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   ret <8 x i32> %res
@@ -5645,9 +5645,9 @@ define <8 x i32> @test2_masked_8xi32_per
 ; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5658,9 +5658,9 @@ define <8 x i32> @test2_masked_z_8xi32_p
 ; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -5670,8 +5670,8 @@ define <8 x i32> @test2_masked_z_8xi32_p
 define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
 ; CHECK-LABEL: test2_16xi32_perm_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   ret <16 x i32> %res
 }
@@ -5680,10 +5680,10 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18453, %ax # imm = 0x4815
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -5694,9 +5694,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18453, %ax # imm = 0x4815
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -5706,10 +5706,10 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $11142, %ax # imm = 0x2B86
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -5720,9 +5720,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $11142, %ax # imm = 0x2B86
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -5732,10 +5732,10 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15610, %ax # imm = 0x3CFA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -5746,9 +5746,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15610, %ax # imm = 0x3CFA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -5756,8 +5756,8 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
 ; CHECK-LABEL: test2_16xi32_perm_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   ret <16 x i32> %res
 }
@@ -5766,10 +5766,10 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14814, %ax # imm = 0x39DE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
   ret <16 x i32> %res
@@ -5780,9 +5780,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $14814, %ax # imm = 0x39DE
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -5790,8 +5790,8 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
 ; CHECK-LABEL: test2_16xi32_perm_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   ret <16 x i32> %res
@@ -5801,9 +5801,9 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $7334, %ax # imm = 0x1CA6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5815,9 +5815,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $7334, %ax # imm = 0x1CA6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -5829,9 +5829,9 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-25463, %ax # imm = 0x9C89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5843,9 +5843,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-25463, %ax # imm = 0x9C89
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -5857,9 +5857,9 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-14529, %ax # imm = 0xC73F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5871,9 +5871,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-14529, %ax # imm = 0xC73F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -5883,8 +5883,8 @@ define <16 x i32> @test2_masked_z_16xi32
 define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
 ; CHECK-LABEL: test2_16xi32_perm_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   ret <16 x i32> %res
@@ -5894,9 +5894,9 @@ define <16 x i32> @test2_masked_16xi32_p
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21392, %ax # imm = 0xAC70
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5908,9 +5908,9 @@ define <16 x i32> @test2_masked_z_16xi32
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21392, %ax # imm = 0xAC70
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -5921,7 +5921,7 @@ define <8 x float> @test2_8xfloat_shuff_
 ; CHECK-LABEL: test2_8xfloat_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
@@ -5930,9 +5930,9 @@ define <8 x float> @test2_8xfloat_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -5943,9 +5943,9 @@ define <8 x float> @test2_8xfloat_zero_m
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -5955,9 +5955,9 @@ define <8 x float> @test2_8xfloat_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-63, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -5968,9 +5968,9 @@ define <8 x float> @test2_8xfloat_zero_m
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-63, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -5980,9 +5980,9 @@ define <8 x float> @test2_8xfloat_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $107, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -5993,9 +5993,9 @@ define <8 x float> @test2_8xfloat_zero_m
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $107, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -6004,7 +6004,7 @@ define <8 x float> @test2_8xfloat_shuff_
 ; CHECK-LABEL: test2_8xfloat_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
@@ -6013,9 +6013,9 @@ define <8 x float> @test2_8xfloat_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $66, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -6026,9 +6026,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $66, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -6036,8 +6036,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x float> %res
@@ -6045,11 +6045,11 @@ define <8 x float> @test_8xfloat_shuff_m
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -6059,11 +6059,11 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -6073,11 +6073,11 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -6087,11 +6087,11 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -6101,11 +6101,11 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-50, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -6115,11 +6115,11 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-50, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -6129,8 +6129,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
@@ -6138,11 +6138,11 @@ define <8 x float> @test_8xfloat_shuff_m
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
 ; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-26, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -6152,11 +6152,11 @@ define <8 x float> @test_8xfloat_masked_
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-26, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -6166,8 +6166,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
 }
@@ -6176,10 +6176,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -6190,9 +6190,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -6202,10 +6202,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -6216,9 +6216,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -6227,10 +6227,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $75, %ax # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -6240,9 +6240,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $75, %ax # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -6250,8 +6250,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   ret <16 x float> %res
 }
@@ -6260,10 +6260,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -6274,9 +6274,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -6284,8 +6284,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x float> %res
@@ -6295,10 +6295,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -6310,9 +6310,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -6324,10 +6324,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -6339,9 +6339,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -6353,10 +6353,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -6368,9 +6368,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -6380,8 +6380,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
@@ -6391,10 +6391,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -6406,9 +6406,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -6419,7 +6419,7 @@ define <4 x double> @test_4xdouble_shuff
 ; CHECK-LABEL: test_4xdouble_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x double> %res
 }
@@ -6428,9 +6428,9 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -6441,9 +6441,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -6453,9 +6453,9 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -6466,9 +6466,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -6478,9 +6478,9 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -6491,9 +6491,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -6502,7 +6502,7 @@ define <4 x double> @test_4xdouble_shuff
 ; CHECK-LABEL: test_4xdouble_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
@@ -6511,9 +6511,9 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -6524,9 +6524,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -6534,8 +6534,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
@@ -6543,11 +6543,11 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -6557,11 +6557,11 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -6571,11 +6571,11 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -6585,11 +6585,11 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -6599,11 +6599,11 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
@@ -6613,11 +6613,11 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -6627,8 +6627,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
@@ -6636,11 +6636,11 @@ define <4 x double> @test_4xdouble_shuff
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
 ; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -6650,11 +6650,11 @@ define <4 x double> @test_4xdouble_maske
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -6664,8 +6664,8 @@ define <4 x double> @test_4xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   ret <8 x double> %res
 }
@@ -6673,10 +6673,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -6686,9 +6686,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -6697,10 +6697,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -6710,9 +6710,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-70, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -6721,10 +6721,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $30, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -6734,9 +6734,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $30, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -6744,8 +6744,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   ret <8 x double> %res
 }
@@ -6753,10 +6753,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -6766,9 +6766,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $56, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -6776,8 +6776,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   ret <8 x double> %res
@@ -6786,10 +6786,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $95, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -6800,9 +6800,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $95, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -6813,10 +6813,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
@@ -6827,9 +6827,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -6840,10 +6840,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -6854,9 +6854,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -6866,8 +6866,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   ret <8 x double> %res
@@ -6876,10 +6876,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -6890,9 +6890,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -6903,7 +6903,7 @@ define <8 x i32> @test_8xi32_shuff_mask0
 ; CHECK-LABEL: test_8xi32_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
 }
@@ -6912,9 +6912,9 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $26, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
@@ -6925,9 +6925,9 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $26, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -6937,9 +6937,9 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
@@ -6950,9 +6950,9 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -6962,9 +6962,9 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $51, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
@@ -6975,9 +6975,9 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $51, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -6986,7 +6986,7 @@ define <8 x i32> @test_8xi32_shuff_mask3
 ; CHECK-LABEL: test_8xi32_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
 }
@@ -6995,9 +6995,9 @@ define <8 x i32> @test_8xi32_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $92, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
   ret <8 x i32> %res
@@ -7008,9 +7008,9 @@ define <8 x i32> @test_8xi32_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $92, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
@@ -7018,8 +7018,8 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
@@ -7027,11 +7027,11 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7041,11 +7041,11 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -7055,11 +7055,11 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7069,11 +7069,11 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -7083,11 +7083,11 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $113, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7097,11 +7097,11 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $113, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -7111,8 +7111,8 @@ define <8 x i32> @test_8xi32_zero_masked
 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
@@ -7120,11 +7120,11 @@ define <8 x i32> @test_8xi32_shuff_mem_m
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {
 ; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $45, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7134,11 +7134,11 @@ define <8 x i32> @test_8xi32_masked_shuf
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
 ; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $45, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer
@@ -7148,8 +7148,8 @@ define <8 x i32> @test_8xi32_zero_masked
 define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_16xi32_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
 }
@@ -7158,10 +7158,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
@@ -7172,9 +7172,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -7184,10 +7184,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
@@ -7198,9 +7198,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -7210,10 +7210,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
@@ -7224,9 +7224,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -7234,8 +7234,8 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
 ; CHECK-LABEL: test_16xi32_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i32> %res
 }
@@ -7244,10 +7244,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
   ret <16 x i32> %res
@@ -7258,9 +7258,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
@@ -7268,8 +7268,8 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
 ; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   ret <16 x i32> %res
@@ -7279,10 +7279,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7294,9 +7294,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -7308,10 +7308,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7323,9 +7323,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -7337,10 +7337,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7352,9 +7352,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -7364,8 +7364,8 @@ define <16 x i32> @test_16xi32_zero_mask
 define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
 ; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
@@ -7375,10 +7375,10 @@ define <16 x i32> @test_16xi32_masked_sh
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7390,9 +7390,9 @@ define <16 x i32> @test_16xi32_zero_mask
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
@@ -7403,7 +7403,7 @@ define <4 x i64> @test_4xi64_shuff_mask0
 ; CHECK-LABEL: test_4xi64_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x i64> %res
 }
@@ -7412,9 +7412,9 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
@@ -7425,9 +7425,9 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -7437,9 +7437,9 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
@@ -7450,9 +7450,9 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -7462,9 +7462,9 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
@@ -7475,9 +7475,9 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -7486,7 +7486,7 @@ define <4 x i64> @test_4xi64_shuff_mask3
 ; CHECK-LABEL: test_4xi64_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
@@ -7495,9 +7495,9 @@ define <4 x i64> @test_4xi64_masked_shuf
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
   ret <4 x i64> %res
@@ -7508,9 +7508,9 @@ define <4 x i64> @test_4xi64_zero_masked
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
@@ -7518,8 +7518,8 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
@@ -7527,11 +7527,11 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7541,11 +7541,11 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -7555,11 +7555,11 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7569,11 +7569,11 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -7583,11 +7583,11 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7597,11 +7597,11 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -7611,8 +7611,8 @@ define <4 x i64> @test_4xi64_zero_masked
 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
@@ -7620,11 +7620,11 @@ define <4 x i64> @test_4xi64_shuff_mem_m
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
 ; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7634,11 +7634,11 @@ define <4 x i64> @test_4xi64_masked_shuf
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
 ; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -7648,8 +7648,8 @@ define <4 x i64> @test_4xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_8xi64_shuff_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   ret <8 x i64> %res
 }
@@ -7657,10 +7657,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
@@ -7670,9 +7670,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-15, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -7681,10 +7681,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-17, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
@@ -7694,9 +7694,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-17, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -7705,10 +7705,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
@@ -7718,9 +7718,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -7728,8 +7728,8 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
 ; CHECK-LABEL: test_8xi64_shuff_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
 }
@@ -7737,10 +7737,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
   ret <8 x i64> %res
@@ -7750,9 +7750,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
@@ -7760,8 +7760,8 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
 ; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
@@ -7770,10 +7770,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7784,9 +7784,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -7797,10 +7797,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7811,9 +7811,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -7824,10 +7824,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7838,9 +7838,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -7850,8 +7850,8 @@ define <8 x i64> @test_8xi64_zero_masked
 define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
 ; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   ret <8 x i64> %res
@@ -7860,10 +7860,10 @@ define <8 x i64> @test_8xi64_masked_shuf
 ; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7874,9 +7874,9 @@ define <8 x i64> @test_8xi64_zero_masked
 ; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
@@ -7887,7 +7887,7 @@ define <4 x float> @test_4xfloat_unpack_
 ; CHECK-LABEL: test_4xfloat_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
@@ -7895,10 +7895,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -7908,9 +7908,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -7919,10 +7919,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -7932,9 +7932,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -7943,10 +7943,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -7956,9 +7956,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -7967,7 +7967,7 @@ define <4 x float> @test_4xfloat_unpack_
 ; CHECK-LABEL: test_4xfloat_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
@@ -7975,10 +7975,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -7988,9 +7988,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -7998,8 +7998,8 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
 ; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
@@ -8008,10 +8008,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
@@ -8022,9 +8022,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -8035,10 +8035,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
@@ -8049,9 +8049,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -8062,10 +8062,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
@@ -8076,9 +8076,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -8088,8 +8088,8 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
 ; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
@@ -8098,10 +8098,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
@@ -8112,9 +8112,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -8125,7 +8125,7 @@ define <8 x float> @test_8xfloat_unpack_
 ; CHECK-LABEL: test_8xfloat_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
@@ -8133,10 +8133,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $122, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -8146,9 +8146,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $122, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -8157,10 +8157,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -8170,9 +8170,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-107, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -8181,10 +8181,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -8194,9 +8194,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -8205,7 +8205,7 @@ define <8 x float> @test_8xfloat_unpack_
 ; CHECK-LABEL: test_8xfloat_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
@@ -8213,10 +8213,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-127, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -8226,9 +8226,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-127, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -8236,8 +8236,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
@@ -8246,10 +8246,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
@@ -8260,9 +8260,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $72, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -8273,10 +8273,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -8287,9 +8287,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -8300,10 +8300,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -8314,9 +8314,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -8326,8 +8326,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
@@ -8336,10 +8336,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
@@ -8350,9 +8350,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -8362,8 +8362,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_unpack_low_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
 }
@@ -8372,10 +8372,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -8386,9 +8386,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -8398,10 +8398,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -8412,9 +8412,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -8424,10 +8424,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -8438,9 +8438,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -8448,8 +8448,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_unpack_low_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
 }
@@ -8458,10 +8458,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -8472,9 +8472,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -8482,8 +8482,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
@@ -8493,10 +8493,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
@@ -8508,9 +8508,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -8522,10 +8522,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -8537,9 +8537,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -8551,10 +8551,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -8566,9 +8566,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -8578,8 +8578,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
@@ -8589,10 +8589,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -8604,9 +8604,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -8617,7 +8617,7 @@ define <2 x double> @test_2xdouble_unpac
 ; CHECK-LABEL: test_2xdouble_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   ret <2 x double> %res
 }
@@ -8625,10 +8625,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
@@ -8638,9 +8638,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
@@ -8649,10 +8649,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
@@ -8662,9 +8662,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
@@ -8672,8 +8672,8 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   ret <2 x double> %res
@@ -8682,10 +8682,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
@@ -8696,9 +8696,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -8709,10 +8709,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
@@ -8723,9 +8723,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -8736,7 +8736,7 @@ define <4 x double> @test_4xdouble_unpac
 ; CHECK-LABEL: test_4xdouble_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x double> %res
 }
@@ -8744,10 +8744,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -8757,9 +8757,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -8768,10 +8768,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -8781,9 +8781,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -8792,10 +8792,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -8805,9 +8805,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -8816,7 +8816,7 @@ define <4 x double> @test_4xdouble_unpac
 ; CHECK-LABEL: test_4xdouble_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x double> %res
 }
@@ -8824,10 +8824,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -8837,9 +8837,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -8847,8 +8847,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x double> %res
@@ -8857,10 +8857,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
@@ -8871,9 +8871,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -8884,10 +8884,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -8898,9 +8898,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -8911,10 +8911,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
@@ -8925,9 +8925,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -8937,8 +8937,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   ret <4 x double> %res
@@ -8947,10 +8947,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec3
@@ -8961,9 +8961,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -8973,8 +8973,8 @@ define <4 x double> @test_4xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_unpack_low_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x double> %res
 }
@@ -8982,10 +8982,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -8995,9 +8995,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-73, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -9006,10 +9006,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -9019,9 +9019,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $102, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -9030,10 +9030,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -9043,9 +9043,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-46, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -9053,8 +9053,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_unpack_low_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x double> %res
 }
@@ -9062,10 +9062,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -9075,9 +9075,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -9085,8 +9085,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x double> %res
@@ -9095,10 +9095,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -9109,9 +9109,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -9122,10 +9122,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -9136,9 +9136,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $126, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -9149,10 +9149,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
@@ -9163,9 +9163,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -9175,8 +9175,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   ret <8 x double> %res
@@ -9185,10 +9185,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -9199,9 +9199,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
   %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -9212,7 +9212,7 @@ define <4 x float> @test_4xfloat_unpack_
 ; CHECK-LABEL: test_4xfloat_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   ret <4 x float> %res
 }
@@ -9220,10 +9220,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -9233,9 +9233,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -9244,10 +9244,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -9257,9 +9257,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -9268,10 +9268,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -9281,9 +9281,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -9292,7 +9292,7 @@ define <4 x float> @test_4xfloat_unpack_
 ; CHECK-LABEL: test_4xfloat_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   ret <4 x float> %res
 }
@@ -9300,10 +9300,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
   ret <4 x float> %res
@@ -9313,9 +9313,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
@@ -9323,8 +9323,8 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
 ; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   ret <4 x float> %res
@@ -9333,10 +9333,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
@@ -9347,9 +9347,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -9360,10 +9360,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
@@ -9374,9 +9374,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -9387,10 +9387,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
@@ -9401,9 +9401,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -9413,8 +9413,8 @@ define <4 x float> @test_4xfloat_zero_ma
 define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
 ; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   ret <4 x float> %res
@@ -9423,10 +9423,10 @@ define <4 x float> @test_4xfloat_masked_
 ; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
@@ -9437,9 +9437,9 @@ define <4 x float> @test_4xfloat_zero_ma
 ; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -9450,7 +9450,7 @@ define <8 x float> @test_8xfloat_unpack_
 ; CHECK-LABEL: test_8xfloat_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   ret <8 x float> %res
 }
@@ -9458,10 +9458,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -9471,9 +9471,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $21, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -9482,10 +9482,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -9495,9 +9495,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -9506,10 +9506,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -9519,9 +9519,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-126, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -9530,7 +9530,7 @@ define <8 x float> @test_8xfloat_unpack_
 ; CHECK-LABEL: test_8xfloat_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   ret <8 x float> %res
 }
@@ -9538,10 +9538,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
   ret <8 x float> %res
@@ -9551,9 +9551,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-19, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
@@ -9561,8 +9561,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   ret <8 x float> %res
@@ -9571,10 +9571,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
@@ -9585,9 +9585,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $28, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -9598,10 +9598,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -9612,9 +9612,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -9625,10 +9625,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -9639,9 +9639,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -9651,8 +9651,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
 ; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   ret <8 x float> %res
@@ -9661,10 +9661,10 @@ define <8 x float> @test_8xfloat_masked_
 ; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
@@ -9675,9 +9675,9 @@ define <8 x float> @test_8xfloat_zero_ma
 ; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
@@ -9687,8 +9687,8 @@ define <8 x float> @test_8xfloat_zero_ma
 define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_unpack_high_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   ret <16 x float> %res
 }
@@ -9697,10 +9697,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -9711,9 +9711,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -9723,10 +9723,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -9737,9 +9737,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -9749,10 +9749,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -9763,9 +9763,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -9773,8 +9773,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
 ; CHECK-LABEL: test_16xfloat_unpack_high_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   ret <16 x float> %res
 }
@@ -9783,10 +9783,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
   ret <16 x float> %res
@@ -9797,9 +9797,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
@@ -9807,8 +9807,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   ret <16 x float> %res
@@ -9818,10 +9818,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -9833,9 +9833,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -9847,10 +9847,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
@@ -9862,9 +9862,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -9876,10 +9876,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
@@ -9891,9 +9891,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -9903,8 +9903,8 @@ define <16 x float> @test_16xfloat_zero_
 define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
 ; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   ret <16 x float> %res
@@ -9914,10 +9914,10 @@ define <16 x float> @test_16xfloat_maske
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
@@ -9929,9 +9929,9 @@ define <16 x float> @test_16xfloat_zero_
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
 ; CHECK-NEXT:    # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
   %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
@@ -9942,7 +9942,7 @@ define <2 x double> @test_2xdouble_unpac
 ; CHECK-LABEL: test_2xdouble_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   ret <2 x double> %res
 }
@@ -9950,10 +9950,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
@@ -9963,9 +9963,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
@@ -9974,10 +9974,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
   ret <2 x double> %res
@@ -9987,9 +9987,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
   ret <2 x double> %res
@@ -9997,8 +9997,8 @@ define <2 x double> @test_2xdouble_zero_
 define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
 ; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   ret <2 x double> %res
@@ -10007,10 +10007,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
@@ -10021,9 +10021,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -10034,10 +10034,10 @@ define <2 x double> @test_2xdouble_maske
 ; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
@@ -10048,9 +10048,9 @@ define <2 x double> @test_2xdouble_zero_
 ; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <2 x double>, <2 x double>* %vec2p
   %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
   %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -10061,7 +10061,7 @@ define <4 x double> @test_4xdouble_unpac
 ; CHECK-LABEL: test_4xdouble_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x double> %res
 }
@@ -10069,10 +10069,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -10082,9 +10082,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $9, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -10093,10 +10093,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -10106,9 +10106,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -10117,10 +10117,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -10130,9 +10130,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -10141,7 +10141,7 @@ define <4 x double> @test_4xdouble_unpac
 ; CHECK-LABEL: test_4xdouble_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x double> %res
 }
@@ -10149,10 +10149,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec3
   ret <4 x double> %res
@@ -10162,9 +10162,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
@@ -10172,8 +10172,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x double> %res
@@ -10182,10 +10182,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -10196,9 +10196,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -10209,10 +10209,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -10223,9 +10223,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -10236,10 +10236,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -10250,9 +10250,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -10262,8 +10262,8 @@ define <4 x double> @test_4xdouble_zero_
 define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
 ; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   ret <4 x double> %res
@@ -10272,10 +10272,10 @@ define <4 x double> @test_4xdouble_maske
 ; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3
@@ -10286,9 +10286,9 @@ define <4 x double> @test_4xdouble_zero_
 ; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
   %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -10298,8 +10298,8 @@ define <4 x double> @test_4xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_unpack_high_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x double> %res
 }
@@ -10307,10 +10307,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -10320,9 +10320,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-27, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -10331,10 +10331,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-21, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -10344,9 +10344,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-21, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -10355,10 +10355,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-118, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -10368,9 +10368,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-118, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -10378,8 +10378,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
 ; CHECK-LABEL: test_8xdouble_unpack_high_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x double> %res
 }
@@ -10387,10 +10387,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $100, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
 ; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
   ret <8 x double> %res
@@ -10400,9 +10400,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $100, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
@@ -10410,8 +10410,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x double> %res
@@ -10420,10 +10420,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
@@ -10434,9 +10434,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -10447,10 +10447,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $71, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
@@ -10461,9 +10461,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $71, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -10474,10 +10474,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
@@ -10488,9 +10488,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-49, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
@@ -10500,8 +10500,8 @@ define <8 x double> @test_8xdouble_zero_
 define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
 ; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x double> %res
@@ -10510,10 +10510,10 @@ define <8 x double> @test_8xdouble_maske
 ; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-40, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
@@ -10524,9 +10524,9 @@ define <8 x double> @test_8xdouble_zero_
 ; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movb $-40, %al # sched: [1:0.25]
-; CHECK-NEXT:    kmovd %eax, %k1
-; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
-; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1 # sched: [1:1.00]
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
+; CHECK-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer

Modified: llvm/trunk/test/CodeGen/X86/fma-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fma-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fma-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fma-schedule.ll Sun Oct  8 05:52:54 2017
@@ -41,9 +41,9 @@ define <2 x double> @test_vfmadd213pd(<2
 ;
 ; SKX-LABEL: test_vfmadd213pd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213pd:
 ; ZNVER1:       # BB#0:
@@ -83,9 +83,9 @@ define <4 x double> @test_vfmadd213pd_ym
 ;
 ; SKX-LABEL: test_vfmadd213pd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213pd_ymm:
 ; ZNVER1:       # BB#0:
@@ -125,9 +125,9 @@ define <4 x float> @test_vfmadd213ps(<4
 ;
 ; SKX-LABEL: test_vfmadd213ps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213ps:
 ; ZNVER1:       # BB#0:
@@ -167,9 +167,9 @@ define <8 x float> @test_vfmadd213ps_ymm
 ;
 ; SKX-LABEL: test_vfmadd213ps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213ps_ymm:
 ; ZNVER1:       # BB#0:
@@ -209,9 +209,9 @@ define <2 x double> @test_vfmadd213sd(<2
 ;
 ; SKX-LABEL: test_vfmadd213sd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213sd:
 ; ZNVER1:       # BB#0:
@@ -251,9 +251,9 @@ define <4 x float> @test_vfmadd213ss(<4
 ;
 ; SKX-LABEL: test_vfmadd213ss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmadd213ss:
 ; ZNVER1:       # BB#0:
@@ -305,9 +305,9 @@ define <2 x double> @test_vfmaddsubpd(<2
 ;
 ; SKX-LABEL: test_vfmaddsubpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmaddsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubpd:
 ; ZNVER1:       # BB#0:
@@ -347,9 +347,9 @@ define <4 x double> @test_vfmaddsubpd_ym
 ;
 ; SKX-LABEL: test_vfmaddsubpd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmaddsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubpd_ymm:
 ; ZNVER1:       # BB#0:
@@ -389,9 +389,9 @@ define <4 x float> @test_vfmaddsubps(<4
 ;
 ; SKX-LABEL: test_vfmaddsubps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmaddsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubps:
 ; ZNVER1:       # BB#0:
@@ -431,9 +431,9 @@ define <8 x float> @test_vfmaddsubps_ymm
 ;
 ; SKX-LABEL: test_vfmaddsubps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmaddsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmaddsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmaddsubps_ymm:
 ; ZNVER1:       # BB#0:
@@ -485,9 +485,9 @@ define <2 x double> @test_vfmsubaddpd(<2
 ;
 ; SKX-LABEL: test_vfmsubaddpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsubadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddpd:
 ; ZNVER1:       # BB#0:
@@ -527,9 +527,9 @@ define <4 x double> @test_vfmsubaddpd_ym
 ;
 ; SKX-LABEL: test_vfmsubaddpd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsubadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddpd_ymm:
 ; ZNVER1:       # BB#0:
@@ -569,9 +569,9 @@ define <4 x float> @test_vfmsubaddps(<4
 ;
 ; SKX-LABEL: test_vfmsubaddps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsubadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddps:
 ; ZNVER1:       # BB#0:
@@ -611,9 +611,9 @@ define <8 x float> @test_vfmsubaddps_ymm
 ;
 ; SKX-LABEL: test_vfmsubaddps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsubadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsubadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsubaddps_ymm:
 ; ZNVER1:       # BB#0:
@@ -665,9 +665,9 @@ define <2 x double> @test_vfmsub213pd(<2
 ;
 ; SKX-LABEL: test_vfmsub213pd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213pd:
 ; ZNVER1:       # BB#0:
@@ -707,9 +707,9 @@ define <4 x double> @test_vfmsub213pd_ym
 ;
 ; SKX-LABEL: test_vfmsub213pd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213pd_ymm:
 ; ZNVER1:       # BB#0:
@@ -749,9 +749,9 @@ define <4 x float> @test_vfmsub213ps(<4
 ;
 ; SKX-LABEL: test_vfmsub213ps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213ps:
 ; ZNVER1:       # BB#0:
@@ -791,9 +791,9 @@ define <8 x float> @test_vfmsub213ps_ymm
 ;
 ; SKX-LABEL: test_vfmsub213ps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213ps_ymm:
 ; ZNVER1:       # BB#0:
@@ -833,9 +833,9 @@ define <2 x double> @test_vfmsub213sd(<2
 ;
 ; SKX-LABEL: test_vfmsub213sd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213sd:
 ; ZNVER1:       # BB#0:
@@ -875,9 +875,9 @@ define <4 x float> @test_vfmsub213ss(<4
 ;
 ; SKX-LABEL: test_vfmsub213ss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfmsub213ss:
 ; ZNVER1:       # BB#0:
@@ -929,9 +929,9 @@ define <2 x double> @test_vfnmadd213pd(<
 ;
 ; SKX-LABEL: test_vfnmadd213pd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213pd:
 ; ZNVER1:       # BB#0:
@@ -971,9 +971,9 @@ define <4 x double> @test_vfnmadd213pd_y
 ;
 ; SKX-LABEL: test_vfnmadd213pd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213pd_ymm:
 ; ZNVER1:       # BB#0:
@@ -1013,9 +1013,9 @@ define <4 x float> @test_vfnmadd213ps(<4
 ;
 ; SKX-LABEL: test_vfnmadd213ps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213ps:
 ; ZNVER1:       # BB#0:
@@ -1055,9 +1055,9 @@ define <8 x float> @test_vfnmadd213ps_ym
 ;
 ; SKX-LABEL: test_vfnmadd213ps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213ps_ymm:
 ; ZNVER1:       # BB#0:
@@ -1097,9 +1097,9 @@ define <2 x double> @test_vfnmadd213sd(<
 ;
 ; SKX-LABEL: test_vfnmadd213sd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213sd:
 ; ZNVER1:       # BB#0:
@@ -1139,9 +1139,9 @@ define <4 x float> @test_vfnmadd213ss(<4
 ;
 ; SKX-LABEL: test_vfnmadd213ss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmadd213ss:
 ; ZNVER1:       # BB#0:
@@ -1193,9 +1193,9 @@ define <2 x double> @test_vfnmsub213pd(<
 ;
 ; SKX-LABEL: test_vfnmsub213pd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213pd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213pd (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213pd:
 ; ZNVER1:       # BB#0:
@@ -1235,9 +1235,9 @@ define <4 x double> @test_vfnmsub213pd_y
 ;
 ; SKX-LABEL: test_vfnmsub213pd_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213pd %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213pd (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213pd_ymm:
 ; ZNVER1:       # BB#0:
@@ -1277,9 +1277,9 @@ define <4 x float> @test_vfnmsub213ps(<4
 ;
 ; SKX-LABEL: test_vfnmsub213ps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213ps %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213ps (%rdi), %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213ps:
 ; ZNVER1:       # BB#0:
@@ -1319,9 +1319,9 @@ define <8 x float> @test_vfnmsub213ps_ym
 ;
 ; SKX-LABEL: test_vfnmsub213ps_ymm:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213ps %ymm2, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213ps (%rdi), %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213ps_ymm:
 ; ZNVER1:       # BB#0:
@@ -1361,9 +1361,9 @@ define <2 x double> @test_vfnmsub213sd(<
 ;
 ; SKX-LABEL: test_vfnmsub213sd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213sd (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213sd:
 ; ZNVER1:       # BB#0:
@@ -1403,9 +1403,9 @@ define <4 x float> @test_vfnmsub213ss(<4
 ;
 ; SKX-LABEL: test_vfnmsub213ss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmsub213ss %xmm2, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfnmsub213ss (%rdi), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; ZNVER1-LABEL: test_vfnmsub213ss:
 ; ZNVER1:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath.ll Sun Oct  8 05:52:54 2017
@@ -69,9 +69,9 @@ define float @f32_no_estimate(float %x)
 ;
 ; SKX-LABEL: f32_no_estimate:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vdivss %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -151,10 +151,10 @@ define float @f32_one_step(float %x) #1
 ;
 ; SKX-LABEL: f32_one_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -268,14 +268,14 @@ define float @f32_two_step(float %x) #2
 ;
 ; SKX-LABEL: f32_two_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -332,9 +332,9 @@ define <4 x float> @v4f32_no_estimate(<4
 ;
 ; SKX-LABEL: v4f32_no_estimate:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] sched: [6:0.50]
 ; SKX-NEXT:    vdivps %xmm0, %xmm1, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -416,10 +416,10 @@ define <4 x float> @v4f32_one_step(<4 x
 ;
 ; SKX-LABEL: v4f32_one_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -533,14 +533,14 @@ define <4 x float> @v4f32_two_step(<4 x
 ;
 ; SKX-LABEL: v4f32_two_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -600,9 +600,9 @@ define <8 x float> @v8f32_no_estimate(<8
 ;
 ; SKX-LABEL: v8f32_no_estimate:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
 ; SKX-NEXT:    vdivps %ymm0, %ymm1, %ymm0 # sched: [11:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -691,10 +691,10 @@ define <8 x float> @v8f32_one_step(<8 x
 ;
 ; SKX-LABEL: v8f32_one_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -821,14 +821,14 @@ define <8 x float> @v8f32_two_step(<8 x
 ;
 ; SKX-LABEL: v8f32_two_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }

Modified: llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/recip-fastmath2.ll Sun Oct  8 05:52:54 2017
@@ -62,9 +62,9 @@ define float @f32_no_step_2(float %x) #3
 ;
 ; SKX-LABEL: f32_no_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 1234.0, %x
   ret float %div
 }
@@ -152,11 +152,11 @@ define float @f32_one_step_2(float %x) #
 ;
 ; SKX-LABEL: f32_one_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
@@ -252,12 +252,12 @@ define float @f32_one_step_2_divs(float
 ;
 ; SKX-LABEL: f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1 # sched: [9:0.50]
+; SKX-NEXT:    vmulss %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 3456.0, %x
   %div2 = fdiv fast float %div, %x
   ret float %div2
@@ -380,15 +380,15 @@ define float @f32_two_step_2(float %x) #
 ;
 ; SKX-LABEL: f32_two_step_2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
-; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
@@ -478,11 +478,11 @@ define <4 x float> @v4f32_one_step2(<4 x
 ;
 ; SKX-LABEL: v4f32_one_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -580,12 +580,12 @@ define <4 x float> @v4f32_one_step_2_div
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   %div2 = fdiv fast <4 x float> %div, %x
   ret <4 x float> %div2
@@ -708,15 +708,15 @@ define <4 x float> @v4f32_two_step2(<4 x
 ;
 ; SKX-LABEL: v4f32_two_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1 # sched: [4:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1,1,1,1] sched: [6:0.50]
 ; SKX-NEXT:    vmovaps %xmm1, %xmm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -814,11 +814,11 @@ define <8 x float> @v8f32_one_step2(<8 x
 ;
 ; SKX-LABEL: v8f32_one_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -925,12 +925,12 @@ define <8 x float> @v8f32_one_step_2_div
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [4:0.50]
-; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1 # sched: [11:0.50]
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   %div2 = fdiv fast <8 x float> %div, %x
   ret <8 x float> %div2
@@ -1067,15 +1067,15 @@ define <8 x float> @v8f32_two_step2(<8 x
 ;
 ; SKX-LABEL: v8f32_two_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
-; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [1:0.50]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1 # sched: [4:1.00]
+; SKX-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1] sched: [7:0.50]
 ; SKX-NEXT:    vmovaps %ymm1, %ymm3 # sched: [1:1.00]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.50]
-; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3 # sched: [4:0.33]
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -1124,8 +1124,8 @@ define <8 x float> @v8f32_no_step(<8 x f
 ;
 ; SKX-LABEL: v8f32_no_step:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -1183,9 +1183,9 @@ define <8 x float> @v8f32_no_step2(<8 x
 ;
 ; SKX-LABEL: v8f32_no_step2:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
-; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0 # sched: [4:1.00]
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0 # sched: [11:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }

Modified: llvm/trunk/test/CodeGen/X86/sse-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-schedule.ll Sun Oct  8 05:52:54 2017
@@ -49,9 +49,9 @@ define <4 x float> @test_addps(<4 x floa
 ;
 ; SKX-LABEL: test_addps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addps:
 ; BTVER2:       # BB#0:
@@ -109,9 +109,9 @@ define float @test_addss(float %a0, floa
 ;
 ; SKX-LABEL: test_addss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addss:
 ; BTVER2:       # BB#0:
@@ -173,9 +173,9 @@ define <4 x float> @test_andps(<4 x floa
 ;
 ; SKX-LABEL: test_andps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vandps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_andps:
 ; BTVER2:       # BB#0:
@@ -241,9 +241,9 @@ define <4 x float> @test_andnotps(<4 x f
 ;
 ; SKX-LABEL: test_andnotps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vandnps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_andnotps:
 ; BTVER2:       # BB#0:
@@ -313,11 +313,11 @@ define <4 x float> @test_cmpps(<4 x floa
 ;
 ; SKX-LABEL: test_cmpps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0
-; SKX-NEXT:    vcmpeqps (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vcmpeqps (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cmpps:
 ; BTVER2:       # BB#0:
@@ -382,7 +382,7 @@ define float @test_cmpss(float %a0, floa
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cmpss:
 ; BTVER2:       # BB#0:
@@ -494,16 +494,16 @@ define i32 @test_comiss(<4 x float> %a0,
 ; SKX-LABEL: test_comiss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %cl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %cl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; SKX-NEXT:    vcomiss (%rdi), %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %dl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %dl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; SKX-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_comiss:
 ; BTVER2:       # BB#0:
@@ -587,8 +587,8 @@ define float @test_cvtsi2ss(i32 %a0, i32
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ss:
 ; BTVER2:       # BB#0:
@@ -657,8 +657,8 @@ define float @test_cvtsi2ssq(i64 %a0, i6
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [6:2.00]
 ; SKX-NEXT:    vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2ssq:
 ; BTVER2:       # BB#0:
@@ -726,9 +726,9 @@ define i32 @test_cvtss2si(float %a0, flo
 ; SKX-LABEL: test_cvtss2si:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtss2si %xmm0, %ecx # sched: [6:1.00]
-; SKX-NEXT:    vcvtss2si (%rdi), %eax # sched: [6:1.00]
+; SKX-NEXT:    vcvtss2si (%rdi), %eax # sched: [11:1.00]
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2si:
 ; BTVER2:       # BB#0:
@@ -799,9 +799,9 @@ define i64 @test_cvtss2siq(float %a0, fl
 ; SKX-LABEL: test_cvtss2siq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtss2si %xmm0, %rcx # sched: [6:1.00]
-; SKX-NEXT:    vcvtss2si (%rdi), %rax # sched: [6:1.00]
+; SKX-NEXT:    vcvtss2si (%rdi), %rax # sched: [11:1.00]
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2siq:
 ; BTVER2:       # BB#0:
@@ -872,9 +872,9 @@ define i32 @test_cvttss2si(float %a0, fl
 ; SKX-LABEL: test_cvttss2si:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttss2si %xmm0, %ecx # sched: [7:1.00]
-; SKX-NEXT:    vcvttss2si (%rdi), %eax # sched: [6:1.00]
+; SKX-NEXT:    vcvttss2si (%rdi), %eax # sched: [11:1.00]
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2si:
 ; BTVER2:       # BB#0:
@@ -942,9 +942,9 @@ define i64 @test_cvttss2siq(float %a0, f
 ; SKX-LABEL: test_cvttss2siq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttss2si %xmm0, %rcx # sched: [7:1.00]
-; SKX-NEXT:    vcvttss2si (%rdi), %rax # sched: [6:1.00]
+; SKX-NEXT:    vcvttss2si (%rdi), %rax # sched: [11:1.00]
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttss2siq:
 ; BTVER2:       # BB#0:
@@ -1006,8 +1006,8 @@ define <4 x float> @test_divps(<4 x floa
 ; SKX-LABEL: test_divps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdivps (%rdi), %xmm0, %xmm0 # sched: [17:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_divps:
 ; BTVER2:       # BB#0:
@@ -1066,8 +1066,8 @@ define float @test_divss(float %a0, floa
 ; SKX-LABEL: test_divss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdivss (%rdi), %xmm0, %xmm0 # sched: [16:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_divss:
 ; BTVER2:       # BB#0:
@@ -1126,8 +1126,8 @@ define void @test_ldmxcsr(i32 %a0) {
 ; SKX-LABEL: test_ldmxcsr:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SKX-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vldmxcsr -{{[0-9]+}}(%rsp) # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_ldmxcsr:
 ; BTVER2:       # BB#0:
@@ -1189,7 +1189,7 @@ define <4 x float> @test_maxps(<4 x floa
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maxps:
 ; BTVER2:       # BB#0:
@@ -1249,8 +1249,8 @@ define <4 x float> @test_maxss(<4 x floa
 ; SKX-LABEL: test_maxss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maxss:
 ; BTVER2:       # BB#0:
@@ -1311,7 +1311,7 @@ define <4 x float> @test_minps(<4 x floa
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_minps:
 ; BTVER2:       # BB#0:
@@ -1371,8 +1371,8 @@ define <4 x float> @test_minss(<4 x floa
 ; SKX-LABEL: test_minss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vminss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vminss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_minss:
 ; BTVER2:       # BB#0:
@@ -1437,10 +1437,10 @@ define void @test_movaps(<4 x float> *%a
 ;
 ; SKX-LABEL: test_movaps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovaps (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovaps (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovaps %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movaps:
 ; BTVER2:       # BB#0:
@@ -1503,7 +1503,7 @@ define <4 x float> @test_movhlps(<4 x fl
 ; SKX-LABEL: test_movhlps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movhlps:
 ; BTVER2:       # BB#0:
@@ -1567,10 +1567,10 @@ define void @test_movhps(<4 x float> %a0
 ;
 ; SKX-LABEL: test_movhps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movhps:
 ; BTVER2:       # BB#0:
@@ -1637,8 +1637,8 @@ define <4 x float> @test_movlhps(<4 x fl
 ; SKX-LABEL: test_movlhps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movlhps:
 ; BTVER2:       # BB#0:
@@ -1701,10 +1701,10 @@ define void @test_movlps(<4 x float> %a0
 ;
 ; SKX-LABEL: test_movlps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovlps %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movlps:
 ; BTVER2:       # BB#0:
@@ -1765,7 +1765,7 @@ define i32 @test_movmskps(<4 x float> %a
 ; SKX-LABEL: test_movmskps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovmskps %xmm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movmskps:
 ; BTVER2:       # BB#0:
@@ -1821,7 +1821,7 @@ define void @test_movntps(<4 x float> %a
 ; SKX-LABEL: test_movntps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovntps %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movntps:
 ; BTVER2:       # BB#0:
@@ -1881,10 +1881,10 @@ define void @test_movss_mem(float* %a0,
 ;
 ; SKX-LABEL: test_movss_mem:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
-; SKX-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovss %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movss_mem:
 ; BTVER2:       # BB#0:
@@ -1945,7 +1945,7 @@ define <4 x float> @test_movss_reg(<4 x
 ; SKX-LABEL: test_movss_reg:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movss_reg:
 ; BTVER2:       # BB#0:
@@ -2005,10 +2005,10 @@ define void @test_movups(<4 x float> *%a
 ;
 ; SKX-LABEL: test_movups:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovups (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovups (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vaddps %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovups %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movups:
 ; BTVER2:       # BB#0:
@@ -2068,9 +2068,9 @@ define <4 x float> @test_mulps(<4 x floa
 ;
 ; SKX-LABEL: test_mulps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mulps:
 ; BTVER2:       # BB#0:
@@ -2128,9 +2128,9 @@ define float @test_mulss(float %a0, floa
 ;
 ; SKX-LABEL: test_mulss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mulss:
 ; BTVER2:       # BB#0:
@@ -2192,9 +2192,9 @@ define <4 x float> @test_orps(<4 x float
 ;
 ; SKX-LABEL: test_orps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_orps:
 ; BTVER2:       # BB#0:
@@ -2256,8 +2256,8 @@ define void @test_prefetchnta(i8* %a0) {
 ;
 ; SKX-LABEL: test_prefetchnta:
 ; SKX:       # BB#0:
-; SKX-NEXT:    prefetchnta (%rdi) # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    prefetchnta (%rdi) # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_prefetchnta:
 ; BTVER2:       # BB#0:
@@ -2320,10 +2320,10 @@ define <4 x float> @test_rcpps(<4 x floa
 ;
 ; SKX-LABEL: test_rcpps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrcp14ps %xmm0, %xmm0
-; SKX-NEXT:    vrcp14ps (%rdi), %xmm1
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vrcp14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_rcpps:
 ; BTVER2:       # BB#0:
@@ -2400,10 +2400,10 @@ define <4 x float> @test_rcpss(float %a0
 ; SKX-LABEL: test_rcpss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vrcpss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_rcpss:
 ; BTVER2:       # BB#0:
@@ -2477,10 +2477,10 @@ define <4 x float> @test_rsqrtps(<4 x fl
 ;
 ; SKX-LABEL: test_rsqrtps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vrsqrt14ps %xmm0, %xmm0
-; SKX-NEXT:    vrsqrt14ps (%rdi), %xmm1
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vrsqrt14ps %xmm0, %xmm0 # sched: [4:1.00]
+; SKX-NEXT:    vrsqrt14ps (%rdi), %xmm1 # sched: [10:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtps:
 ; BTVER2:       # BB#0:
@@ -2557,10 +2557,10 @@ define <4 x float> @test_rsqrtss(float %
 ; SKX-LABEL: test_rsqrtss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [4:1.00]
-; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [4:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_rsqrtss:
 ; BTVER2:       # BB#0:
@@ -2626,8 +2626,8 @@ define void @test_sfence() {
 ;
 ; SKX-LABEL: test_sfence:
 ; SKX:       # BB#0:
-; SKX-NEXT:    sfence # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    sfence # sched: [2:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_sfence:
 ; BTVER2:       # BB#0:
@@ -2687,8 +2687,8 @@ define <4 x float> @test_shufps(<4 x flo
 ; SKX-LABEL: test_shufps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:1.00]
-; SKX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_shufps:
 ; BTVER2:       # BB#0:
@@ -2754,9 +2754,9 @@ define <4 x float> @test_sqrtps(<4 x flo
 ; SKX-LABEL: test_sqrtps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vsqrtps %xmm0, %xmm0 # sched: [12:1.00]
-; SKX-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [12:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsqrtps (%rdi), %xmm1 # sched: [18:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtps:
 ; BTVER2:       # BB#0:
@@ -2833,10 +2833,10 @@ define <4 x float> @test_sqrtss(<4 x flo
 ; SKX-LABEL: test_sqrtss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
-; SKX-NEXT:    vmovaps (%rdi), %xmm1 # sched: [1:0.50]
+; SKX-NEXT:    vmovaps (%rdi), %xmm1 # sched: [6:0.50]
 ; SKX-NEXT:    vsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtss:
 ; BTVER2:       # BB#0:
@@ -2900,9 +2900,9 @@ define i32 @test_stmxcsr() {
 ;
 ; SKX-LABEL: test_stmxcsr:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
-; SKX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vstmxcsr -{{[0-9]+}}(%rsp) # sched: [2:1.00]
+; SKX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # sched: [5:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_stmxcsr:
 ; BTVER2:       # BB#0:
@@ -2962,9 +2962,9 @@ define <4 x float> @test_subps(<4 x floa
 ;
 ; SKX-LABEL: test_subps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_subps:
 ; BTVER2:       # BB#0:
@@ -3022,9 +3022,9 @@ define float @test_subss(float %a0, floa
 ;
 ; SKX-LABEL: test_subss:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vsubss (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_subss:
 ; BTVER2:       # BB#0:
@@ -3131,16 +3131,16 @@ define i32 @test_ucomiss(<4 x float> %a0
 ; SKX-LABEL: test_ucomiss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vucomiss %xmm1, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %cl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %cl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; SKX-NEXT:    vucomiss (%rdi), %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %dl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %dl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; SKX-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_ucomiss:
 ; BTVER2:       # BB#0:
@@ -3221,8 +3221,8 @@ define <4 x float> @test_unpckhps(<4 x f
 ; SKX-LABEL: test_unpckhps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhps:
 ; BTVER2:       # BB#0:
@@ -3285,8 +3285,8 @@ define <4 x float> @test_unpcklps(<4 x f
 ; SKX-LABEL: test_unpcklps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklps:
 ; BTVER2:       # BB#0:
@@ -3348,9 +3348,9 @@ define <4 x float> @test_xorps(<4 x floa
 ;
 ; SKX-LABEL: test_xorps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vxorps (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_xorps:
 ; BTVER2:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/sse2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse2-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse2-schedule.ll Sun Oct  8 05:52:54 2017
@@ -49,9 +49,9 @@ define <2 x double> @test_addpd(<2 x dou
 ;
 ; SKX-LABEL: test_addpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addpd:
 ; BTVER2:       # BB#0:
@@ -109,9 +109,9 @@ define double @test_addsd(double %a0, do
 ;
 ; SKX-LABEL: test_addsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addsd:
 ; BTVER2:       # BB#0:
@@ -175,10 +175,10 @@ define <2 x double> @test_andpd(<2 x dou
 ;
 ; SKX-LABEL: test_andpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vandpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_andpd:
 ; BTVER2:       # BB#0:
@@ -249,10 +249,10 @@ define <2 x double> @test_andnotpd(<2 x
 ;
 ; SKX-LABEL: test_andnotpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vandnpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_andnotpd:
 ; BTVER2:       # BB#0:
@@ -325,11 +325,11 @@ define <2 x double> @test_cmppd(<2 x dou
 ;
 ; SKX-LABEL: test_cmppd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0
-; SKX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vcmpeqpd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vcmpeqpd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cmppd:
 ; BTVER2:       # BB#0:
@@ -394,7 +394,7 @@ define double @test_cmpsd(double %a0, do
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cmpsd:
 ; BTVER2:       # BB#0:
@@ -506,16 +506,16 @@ define i32 @test_comisd(<2 x double> %a0
 ; SKX-LABEL: test_comisd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %cl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %cl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; SKX-NEXT:    vcomisd (%rdi), %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %dl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %dl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; SKX-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_comisd:
 ; BTVER2:       # BB#0:
@@ -598,9 +598,9 @@ define <2 x double> @test_cvtdq2pd(<4 x
 ; SKX-LABEL: test_cvtdq2pd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtdq2pd %xmm0, %xmm0 # sched: [5:1.00]
-; SKX-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [5:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vcvtdq2pd (%rdi), %xmm1 # sched: [11:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2pd:
 ; BTVER2:       # BB#0:
@@ -671,9 +671,9 @@ define <4 x float> @test_cvtdq2ps(<4 x i
 ; SKX-LABEL: test_cvtdq2ps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vcvtdq2ps (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtdq2ps:
 ; BTVER2:       # BB#0:
@@ -743,8 +743,8 @@ define <4 x i32> @test_cvtpd2dq(<2 x dou
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtpd2dq %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2dq:
 ; BTVER2:       # BB#0:
@@ -815,8 +815,8 @@ define <4 x float> @test_cvtpd2ps(<2 x d
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtpd2ps %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtpd2ps:
 ; BTVER2:       # BB#0:
@@ -886,9 +886,9 @@ define <4 x i32> @test_cvtps2dq(<4 x flo
 ; SKX-LABEL: test_cvtps2dq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtps2dq %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vcvtps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2dq:
 ; BTVER2:       # BB#0:
@@ -958,9 +958,9 @@ define <2 x double> @test_cvtps2pd(<4 x
 ; SKX-LABEL: test_cvtps2pd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtps2pd %xmm0, %xmm0 # sched: [5:1.00]
-; SKX-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vcvtps2pd (%rdi), %xmm1 # sched: [9:0.50]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtps2pd:
 ; BTVER2:       # BB#0:
@@ -1030,9 +1030,9 @@ define i32 @test_cvtsd2si(double %a0, do
 ; SKX-LABEL: test_cvtsd2si:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsd2si %xmm0, %ecx # sched: [6:1.00]
-; SKX-NEXT:    vcvtsd2si (%rdi), %eax # sched: [6:1.00]
+; SKX-NEXT:    vcvtsd2si (%rdi), %eax # sched: [11:1.00]
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2si:
 ; BTVER2:       # BB#0:
@@ -1103,9 +1103,9 @@ define i64 @test_cvtsd2siq(double %a0, d
 ; SKX-LABEL: test_cvtsd2siq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsd2si %xmm0, %rcx # sched: [6:1.00]
-; SKX-NEXT:    vcvtsd2si (%rdi), %rax # sched: [6:1.00]
+; SKX-NEXT:    vcvtsd2si (%rdi), %rax # sched: [11:1.00]
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2siq:
 ; BTVER2:       # BB#0:
@@ -1183,10 +1183,10 @@ define float @test_cvtsd2ss(double %a0,
 ; SKX-LABEL: test_cvtsd2ss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
+; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
 ; SKX-NEXT:    vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
-; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsd2ss:
 ; BTVER2:       # BB#0:
@@ -1257,8 +1257,8 @@ define double @test_cvtsi2sd(i32 %a0, i3
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sd:
 ; BTVER2:       # BB#0:
@@ -1327,8 +1327,8 @@ define double @test_cvtsi2sdq(i64 %a0, i
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [9:1.00]
-; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtsi2sdq:
 ; BTVER2:       # BB#0:
@@ -1405,10 +1405,10 @@ define double @test_cvtss2sd(float %a0,
 ; SKX-LABEL: test_cvtss2sd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
-; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [1:0.50]
+; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
 ; SKX-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [5:1.00]
-; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvtss2sd:
 ; BTVER2:       # BB#0:
@@ -1480,8 +1480,8 @@ define <4 x i32> @test_cvttpd2dq(<2 x do
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttpd2dq %xmm0, %xmm0 # sched: [5:1.00]
 ; SKX-NEXT:    vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttpd2dq:
 ; BTVER2:       # BB#0:
@@ -1552,9 +1552,9 @@ define <4 x i32> @test_cvttps2dq(<4 x fl
 ; SKX-LABEL: test_cvttps2dq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttps2dq %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [4:0.50]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vcvttps2dq (%rdi), %xmm1 # sched: [10:0.50]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttps2dq:
 ; BTVER2:       # BB#0:
@@ -1622,9 +1622,9 @@ define i32 @test_cvttsd2si(double %a0, d
 ; SKX-LABEL: test_cvttsd2si:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttsd2si %xmm0, %ecx # sched: [6:1.00]
-; SKX-NEXT:    vcvttsd2si (%rdi), %eax # sched: [6:1.00]
+; SKX-NEXT:    vcvttsd2si (%rdi), %eax # sched: [11:1.00]
 ; SKX-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2si:
 ; BTVER2:       # BB#0:
@@ -1692,9 +1692,9 @@ define i64 @test_cvttsd2siq(double %a0,
 ; SKX-LABEL: test_cvttsd2siq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vcvttsd2si %xmm0, %rcx # sched: [6:1.00]
-; SKX-NEXT:    vcvttsd2si (%rdi), %rax # sched: [6:1.00]
+; SKX-NEXT:    vcvttsd2si (%rdi), %rax # sched: [11:1.00]
 ; SKX-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_cvttsd2siq:
 ; BTVER2:       # BB#0:
@@ -1756,8 +1756,8 @@ define <2 x double> @test_divpd(<2 x dou
 ; SKX-LABEL: test_divpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdivpd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
-; SKX-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdivpd (%rdi), %xmm0, %xmm0 # sched: [20:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_divpd:
 ; BTVER2:       # BB#0:
@@ -1816,8 +1816,8 @@ define double @test_divsd(double %a0, do
 ; SKX-LABEL: test_divsd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0 # sched: [14:1.00]
-; SKX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [14:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdivsd (%rdi), %xmm0, %xmm0 # sched: [19:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_divsd:
 ; BTVER2:       # BB#0:
@@ -1876,7 +1876,7 @@ define void @test_lfence() {
 ; SKX-LABEL: test_lfence:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    lfence # sched: [2:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_lfence:
 ; BTVER2:       # BB#0:
@@ -1931,8 +1931,8 @@ define void @test_mfence() {
 ;
 ; SKX-LABEL: test_mfence:
 ; SKX:       # BB#0:
-; SKX-NEXT:    mfence # sched: [2:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    mfence # sched: [3:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mfence:
 ; BTVER2:       # BB#0:
@@ -1985,8 +1985,8 @@ define void @test_maskmovdqu(<16 x i8> %
 ;
 ; SKX-LABEL: test_maskmovdqu:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmaskmovdqu %xmm1, %xmm0 # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maskmovdqu:
 ; BTVER2:       # BB#0:
@@ -2043,7 +2043,7 @@ define <2 x double> @test_maxpd(<2 x dou
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maxpd:
 ; BTVER2:       # BB#0:
@@ -2103,8 +2103,8 @@ define <2 x double> @test_maxsd(<2 x dou
 ; SKX-LABEL: test_maxsd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_maxsd:
 ; BTVER2:       # BB#0:
@@ -2165,7 +2165,7 @@ define <2 x double> @test_minpd(<2 x dou
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
 ; SKX-NEXT:    vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_minpd:
 ; BTVER2:       # BB#0:
@@ -2225,8 +2225,8 @@ define <2 x double> @test_minsd(<2 x dou
 ; SKX-LABEL: test_minsd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vminsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vminsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_minsd:
 ; BTVER2:       # BB#0:
@@ -2291,10 +2291,10 @@ define void @test_movapd(<2 x double> *%
 ;
 ; SKX-LABEL: test_movapd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovapd (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovapd (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovapd %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movapd:
 ; BTVER2:       # BB#0:
@@ -2360,10 +2360,10 @@ define void @test_movdqa(<2 x i64> *%a0,
 ;
 ; SKX-LABEL: test_movdqa:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovdqa %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movdqa:
 ; BTVER2:       # BB#0:
@@ -2429,10 +2429,10 @@ define void @test_movdqu(<2 x i64> *%a0,
 ;
 ; SKX-LABEL: test_movdqu:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vmovdqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovdqu %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movdqu:
 ; BTVER2:       # BB#0:
@@ -2516,13 +2516,13 @@ define i32 @test_movd(<4 x i32> %a0, i32
 ;
 ; SKX-LABEL: test_movd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [1:0.50]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:0.50]
+; SKX-NEXT:    vmovd %edi, %xmm2 # sched: [1:1.00]
+; SKX-NEXT:    vpaddd %xmm2, %xmm0, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    vmovd %xmm1, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovd %xmm2, (%rsi) # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movd:
 ; BTVER2:       # BB#0:
@@ -2617,13 +2617,13 @@ define i64 @test_movd_64(<2 x i64> %a0,
 ;
 ; SKX-LABEL: test_movd_64:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovq %rdi, %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT:    vmovq %rdi, %xmm2 # sched: [1:1.00]
+; SKX-NEXT:    vpaddq %xmm2, %xmm0, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
-; SKX-NEXT:    vmovq %xmm1, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovq %xmm2, (%rsi) # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movd_64:
 ; BTVER2:       # BB#0:
@@ -2700,10 +2700,10 @@ define void @test_movhpd(<2 x double> %a
 ;
 ; SKX-LABEL: test_movhpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovhpd %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movhpd:
 ; BTVER2:       # BB#0:
@@ -2772,10 +2772,10 @@ define void @test_movlpd(<2 x double> %a
 ;
 ; SKX-LABEL: test_movlpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovlpd %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movlpd:
 ; BTVER2:       # BB#0:
@@ -2835,7 +2835,7 @@ define i32 @test_movmskpd(<2 x double> %
 ; SKX-LABEL: test_movmskpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovmskpd %xmm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movmskpd:
 ; BTVER2:       # BB#0:
@@ -2892,9 +2892,9 @@ define void @test_movntdqa(<2 x i64> %a0
 ;
 ; SKX-LABEL: test_movntdqa:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovntdq %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movntdqa:
 ; BTVER2:       # BB#0:
@@ -2951,9 +2951,9 @@ define void @test_movntpd(<2 x double> %
 ;
 ; SKX-LABEL: test_movntpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovntpd %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movntpd:
 ; BTVER2:       # BB#0:
@@ -3016,10 +3016,10 @@ define <2 x i64> @test_movq_mem(<2 x i64
 ;
 ; SKX-LABEL: test_movq_mem:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
 ; SKX-NEXT:    vmovq %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movq_mem:
 ; BTVER2:       # BB#0:
@@ -3083,9 +3083,9 @@ define <2 x i64> @test_movq_reg(<2 x i64
 ;
 ; SKX-LABEL: test_movq_reg:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33]
+; SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movq_reg:
 ; BTVER2:       # BB#0:
@@ -3148,10 +3148,10 @@ define void @test_movsd_mem(double* %a0,
 ;
 ; SKX-LABEL: test_movsd_mem:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50]
-; SKX-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50]
+; SKX-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovsd %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_mem:
 ; BTVER2:       # BB#0:
@@ -3213,7 +3213,7 @@ define <2 x double> @test_movsd_reg(<2 x
 ; SKX-LABEL: test_movsd_reg:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movsd_reg:
 ; BTVER2:       # BB#0:
@@ -3273,10 +3273,10 @@ define void @test_movupd(<2 x double> *%
 ;
 ; SKX-LABEL: test_movupd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovupd (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovupd (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm0, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovupd %xmm0, (%rsi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movupd:
 ; BTVER2:       # BB#0:
@@ -3336,9 +3336,9 @@ define <2 x double> @test_mulpd(<2 x dou
 ;
 ; SKX-LABEL: test_mulpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mulpd:
 ; BTVER2:       # BB#0:
@@ -3396,9 +3396,9 @@ define double @test_mulsd(double %a0, do
 ;
 ; SKX-LABEL: test_mulsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mulsd:
 ; BTVER2:       # BB#0:
@@ -3462,10 +3462,10 @@ define <2 x double> @test_orpd(<2 x doub
 ;
 ; SKX-LABEL: test_orpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_orpd:
 ; BTVER2:       # BB#0:
@@ -3535,8 +3535,8 @@ define <8 x i16> @test_packssdw(<4 x i32
 ; SKX-LABEL: test_packssdw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_packssdw:
 ; BTVER2:       # BB#0:
@@ -3601,8 +3601,8 @@ define <16 x i8> @test_packsswb(<8 x i16
 ; SKX-LABEL: test_packsswb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_packsswb:
 ; BTVER2:       # BB#0:
@@ -3667,8 +3667,8 @@ define <16 x i8> @test_packuswb(<8 x i16
 ; SKX-LABEL: test_packuswb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_packuswb:
 ; BTVER2:       # BB#0:
@@ -3732,9 +3732,9 @@ define <16 x i8> @test_paddb(<16 x i8> %
 ;
 ; SKX-LABEL: test_paddb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpaddb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddb:
 ; BTVER2:       # BB#0:
@@ -3796,9 +3796,9 @@ define <4 x i32> @test_paddd(<4 x i32> %
 ;
 ; SKX-LABEL: test_paddd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpaddd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddd:
 ; BTVER2:       # BB#0:
@@ -3856,9 +3856,9 @@ define <2 x i64> @test_paddq(<2 x i64> %
 ;
 ; SKX-LABEL: test_paddq:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpaddq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddq:
 ; BTVER2:       # BB#0:
@@ -3920,9 +3920,9 @@ define <16 x i8> @test_paddsb(<16 x i8>
 ;
 ; SKX-LABEL: test_paddsb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddsb:
 ; BTVER2:       # BB#0:
@@ -3985,9 +3985,9 @@ define <8 x i16> @test_paddsw(<8 x i16>
 ;
 ; SKX-LABEL: test_paddsw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddsw:
 ; BTVER2:       # BB#0:
@@ -4050,9 +4050,9 @@ define <16 x i8> @test_paddusb(<16 x i8>
 ;
 ; SKX-LABEL: test_paddusb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddusb:
 ; BTVER2:       # BB#0:
@@ -4115,9 +4115,9 @@ define <8 x i16> @test_paddusw(<8 x i16>
 ;
 ; SKX-LABEL: test_paddusw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddusw:
 ; BTVER2:       # BB#0:
@@ -4180,9 +4180,9 @@ define <8 x i16> @test_paddw(<8 x i16> %
 ;
 ; SKX-LABEL: test_paddw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_paddw:
 ; BTVER2:       # BB#0:
@@ -4246,10 +4246,10 @@ define <2 x i64> @test_pand(<2 x i64> %a
 ;
 ; SKX-LABEL: test_pand:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpand (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pand:
 ; BTVER2:       # BB#0:
@@ -4322,10 +4322,10 @@ define <2 x i64> @test_pandn(<2 x i64> %
 ;
 ; SKX-LABEL: test_pandn:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpandn (%rdi), %xmm0, %xmm1 # sched: [7:0.50]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pandn:
 ; BTVER2:       # BB#0:
@@ -4392,9 +4392,9 @@ define <16 x i8> @test_pavgb(<16 x i8> %
 ;
 ; SKX-LABEL: test_pavgb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpavgb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pavgb:
 ; BTVER2:       # BB#0:
@@ -4466,9 +4466,9 @@ define <8 x i16> @test_pavgw(<8 x i16> %
 ;
 ; SKX-LABEL: test_pavgw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpavgw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pavgw:
 ; BTVER2:       # BB#0:
@@ -4544,11 +4544,11 @@ define <16 x i8> @test_pcmpeqb(<16 x i8>
 ;
 ; SKX-LABEL: test_pcmpeqb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpeqb (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpeqb (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqb:
 ; BTVER2:       # BB#0:
@@ -4618,11 +4618,11 @@ define <4 x i32> @test_pcmpeqd(<4 x i32>
 ;
 ; SKX-LABEL: test_pcmpeqd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqd:
 ; BTVER2:       # BB#0:
@@ -4692,11 +4692,11 @@ define <8 x i16> @test_pcmpeqw(<8 x i16>
 ;
 ; SKX-LABEL: test_pcmpeqw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpeqw (%rdi), %xmm0, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpeqw (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korb %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqw:
 ; BTVER2:       # BB#0:
@@ -4767,11 +4767,11 @@ define <16 x i8> @test_pcmpgtb(<16 x i8>
 ;
 ; SKX-LABEL: test_pcmpgtb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpgtb (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpgtb (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtb:
 ; BTVER2:       # BB#0:
@@ -4842,11 +4842,11 @@ define <4 x i32> @test_pcmpgtd(<4 x i32>
 ;
 ; SKX-LABEL: test_pcmpgtd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k1
-; SKX-NEXT:    korw %k1, %k0, %k0
+; SKX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korw %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtd:
 ; BTVER2:       # BB#0:
@@ -4917,11 +4917,11 @@ define <8 x i16> @test_pcmpgtw(<8 x i16>
 ;
 ; SKX-LABEL: test_pcmpgtw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
-; SKX-NEXT:    vpcmpgtw (%rdi), %xmm0, %k1
-; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 # sched: [3:1.00]
+; SKX-NEXT:    vpcmpgtw (%rdi), %xmm0, %k1 # sched: [9:1.00]
+; SKX-NEXT:    korb %k1, %k0, %k0 # sched: [1:1.00]
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtw:
 ; BTVER2:       # BB#0:
@@ -4985,7 +4985,7 @@ define i16 @test_pextrw(<8 x i16> %a0) {
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpextrw $6, %xmm0, %eax # sched: [3:1.00]
 ; SKX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # BB#0:
@@ -5046,8 +5046,8 @@ define <8 x i16> @test_pinsrw(<8 x i16>
 ; SKX-LABEL: test_pinsrw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
-; SKX-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpinsrw $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrw:
 ; BTVER2:       # BB#0:
@@ -5114,8 +5114,8 @@ define <4 x i32> @test_pmaddwd(<8 x i16>
 ; SKX-LABEL: test_pmaddwd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddwd:
 ; BTVER2:       # BB#0:
@@ -5179,9 +5179,9 @@ define <8 x i16> @test_pmaxsw(<8 x i16>
 ;
 ; SKX-LABEL: test_pmaxsw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsw:
 ; BTVER2:       # BB#0:
@@ -5244,9 +5244,9 @@ define <16 x i8> @test_pmaxub(<16 x i8>
 ;
 ; SKX-LABEL: test_pmaxub:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxub:
 ; BTVER2:       # BB#0:
@@ -5309,9 +5309,9 @@ define <8 x i16> @test_pminsw(<8 x i16>
 ;
 ; SKX-LABEL: test_pminsw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminsw:
 ; BTVER2:       # BB#0:
@@ -5374,9 +5374,9 @@ define <16 x i8> @test_pminub(<16 x i8>
 ;
 ; SKX-LABEL: test_pminub:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminub (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminub:
 ; BTVER2:       # BB#0:
@@ -5432,7 +5432,7 @@ define i32 @test_pmovmskb(<16 x i8> %a0)
 ; SKX-LABEL: test_pmovmskb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovmskb %xmm0, %eax # sched: [2:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovmskb:
 ; BTVER2:       # BB#0:
@@ -5488,8 +5488,8 @@ define <8 x i16> @test_pmulhuw(<8 x i16>
 ; SKX-LABEL: test_pmulhuw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhuw:
 ; BTVER2:       # BB#0:
@@ -5549,8 +5549,8 @@ define <8 x i16> @test_pmulhw(<8 x i16>
 ; SKX-LABEL: test_pmulhw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmulhw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhw:
 ; BTVER2:       # BB#0:
@@ -5610,8 +5610,8 @@ define <8 x i16> @test_pmullw(<8 x i16>
 ; SKX-LABEL: test_pmullw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmullw:
 ; BTVER2:       # BB#0:
@@ -5678,8 +5678,8 @@ define <2 x i64> @test_pmuludq(<4 x i32>
 ; SKX-LABEL: test_pmuludq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmuludq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmuludq:
 ; BTVER2:       # BB#0:
@@ -5745,10 +5745,10 @@ define <2 x i64> @test_por(<2 x i64> %a0
 ;
 ; SKX-LABEL: test_por:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_por:
 ; BTVER2:       # BB#0:
@@ -5818,8 +5818,8 @@ define <2 x i64> @test_psadbw(<16 x i8>
 ; SKX-LABEL: test_psadbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psadbw:
 ; BTVER2:       # BB#0:
@@ -5888,9 +5888,9 @@ define <4 x i32> @test_pshufd(<4 x i32>
 ; SKX-LABEL: test_pshufd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [7:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pshufd:
 ; BTVER2:       # BB#0:
@@ -5960,9 +5960,9 @@ define <8 x i16> @test_pshufhw(<8 x i16>
 ; SKX-LABEL: test_pshufhw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [1:1.00]
-; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [7:1.00]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pshufhw:
 ; BTVER2:       # BB#0:
@@ -6032,9 +6032,9 @@ define <8 x i16> @test_pshuflw(<8 x i16>
 ; SKX-LABEL: test_pshuflw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pshuflw:
 ; BTVER2:       # BB#0:
@@ -6102,9 +6102,9 @@ define <4 x i32> @test_pslld(<4 x i32> %
 ; SKX-LABEL: test_pslld:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpslld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpslld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pslld:
 ; BTVER2:       # BB#0:
@@ -6168,7 +6168,7 @@ define <4 x i32> @test_pslldq(<4 x i32>
 ; SKX-LABEL: test_pslldq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pslldq:
 ; BTVER2:       # BB#0:
@@ -6229,9 +6229,9 @@ define <2 x i64> @test_psllq(<2 x i64> %
 ; SKX-LABEL: test_psllq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsllq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psllq:
 ; BTVER2:       # BB#0:
@@ -6301,9 +6301,9 @@ define <8 x i16> @test_psllw(<8 x i16> %
 ; SKX-LABEL: test_psllw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsllw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psllw:
 ; BTVER2:       # BB#0:
@@ -6373,9 +6373,9 @@ define <4 x i32> @test_psrad(<4 x i32> %
 ; SKX-LABEL: test_psrad:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsrad (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psrad:
 ; BTVER2:       # BB#0:
@@ -6445,9 +6445,9 @@ define <8 x i16> @test_psraw(<8 x i16> %
 ; SKX-LABEL: test_psraw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsraw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psraw:
 ; BTVER2:       # BB#0:
@@ -6517,9 +6517,9 @@ define <4 x i32> @test_psrld(<4 x i32> %
 ; SKX-LABEL: test_psrld:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsrld (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psrld:
 ; BTVER2:       # BB#0:
@@ -6583,7 +6583,7 @@ define <4 x i32> @test_psrldq(<4 x i32>
 ; SKX-LABEL: test_psrldq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psrldq:
 ; BTVER2:       # BB#0:
@@ -6644,9 +6644,9 @@ define <2 x i64> @test_psrlq(<2 x i64> %
 ; SKX-LABEL: test_psrlq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsrlq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psrlq:
 ; BTVER2:       # BB#0:
@@ -6716,9 +6716,9 @@ define <8 x i16> @test_psrlw(<8 x i16> %
 ; SKX-LABEL: test_psrlw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
-; SKX-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsrlw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psrlw:
 ; BTVER2:       # BB#0:
@@ -6785,9 +6785,9 @@ define <16 x i8> @test_psubb(<16 x i8> %
 ;
 ; SKX-LABEL: test_psubb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpsubb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubb:
 ; BTVER2:       # BB#0:
@@ -6849,9 +6849,9 @@ define <4 x i32> @test_psubd(<4 x i32> %
 ;
 ; SKX-LABEL: test_psubd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpsubd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubd:
 ; BTVER2:       # BB#0:
@@ -6909,9 +6909,9 @@ define <2 x i64> @test_psubq(<2 x i64> %
 ;
 ; SKX-LABEL: test_psubq:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpsubq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubq:
 ; BTVER2:       # BB#0:
@@ -6973,9 +6973,9 @@ define <16 x i8> @test_psubsb(<16 x i8>
 ;
 ; SKX-LABEL: test_psubsb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubsb:
 ; BTVER2:       # BB#0:
@@ -7038,9 +7038,9 @@ define <8 x i16> @test_psubsw(<8 x i16>
 ;
 ; SKX-LABEL: test_psubsw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubsw:
 ; BTVER2:       # BB#0:
@@ -7103,9 +7103,9 @@ define <16 x i8> @test_psubusb(<16 x i8>
 ;
 ; SKX-LABEL: test_psubusb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubusb:
 ; BTVER2:       # BB#0:
@@ -7168,9 +7168,9 @@ define <8 x i16> @test_psubusw(<8 x i16>
 ;
 ; SKX-LABEL: test_psubusw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubusw:
 ; BTVER2:       # BB#0:
@@ -7233,9 +7233,9 @@ define <8 x i16> @test_psubw(<8 x i16> %
 ;
 ; SKX-LABEL: test_psubw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psubw:
 ; BTVER2:       # BB#0:
@@ -7298,8 +7298,8 @@ define <16 x i8> @test_punpckhbw(<16 x i
 ; SKX-LABEL: test_punpckhbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00]
-; SKX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhbw:
 ; BTVER2:       # BB#0:
@@ -7366,9 +7366,9 @@ define <4 x i32> @test_punpckhdq(<4 x i3
 ; SKX-LABEL: test_punpckhdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [7:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhdq:
 ; BTVER2:       # BB#0:
@@ -7436,9 +7436,9 @@ define <2 x i64> @test_punpckhqdq(<2 x i
 ; SKX-LABEL: test_punpckhqdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhqdq:
 ; BTVER2:       # BB#0:
@@ -7504,8 +7504,8 @@ define <8 x i16> @test_punpckhwd(<8 x i1
 ; SKX-LABEL: test_punpckhwd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpckhwd:
 ; BTVER2:       # BB#0:
@@ -7568,8 +7568,8 @@ define <16 x i8> @test_punpcklbw(<16 x i
 ; SKX-LABEL: test_punpcklbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklbw:
 ; BTVER2:       # BB#0:
@@ -7636,9 +7636,9 @@ define <4 x i32> @test_punpckldq(<4 x i3
 ; SKX-LABEL: test_punpckldq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpckldq:
 ; BTVER2:       # BB#0:
@@ -7706,9 +7706,9 @@ define <2 x i64> @test_punpcklqdq(<2 x i
 ; SKX-LABEL: test_punpcklqdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklqdq:
 ; BTVER2:       # BB#0:
@@ -7774,8 +7774,8 @@ define <8 x i16> @test_punpcklwd(<8 x i1
 ; SKX-LABEL: test_punpcklwd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_punpcklwd:
 ; BTVER2:       # BB#0:
@@ -7839,10 +7839,10 @@ define <2 x i64> @test_pxor(<2 x i64> %a
 ;
 ; SKX-LABEL: test_pxor:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vpxor (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pxor:
 ; BTVER2:       # BB#0:
@@ -7910,9 +7910,9 @@ define <2 x double> @test_shufpd(<2 x do
 ; SKX-LABEL: test_shufpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [1:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [7:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_shufpd:
 ; BTVER2:       # BB#0:
@@ -7981,9 +7981,9 @@ define <2 x double> @test_sqrtpd(<2 x do
 ; SKX-LABEL: test_sqrtpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vsqrtpd %xmm0, %xmm0 # sched: [18:1.00]
-; SKX-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [18:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsqrtpd (%rdi), %xmm1 # sched: [24:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtpd:
 ; BTVER2:       # BB#0:
@@ -8060,10 +8060,10 @@ define <2 x double> @test_sqrtsd(<2 x do
 ; SKX-LABEL: test_sqrtsd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00]
-; SKX-NEXT:    vmovapd (%rdi), %xmm1 # sched: [1:0.50]
+; SKX-NEXT:    vmovapd (%rdi), %xmm1 # sched: [6:0.50]
 ; SKX-NEXT:    vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [18:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_sqrtsd:
 ; BTVER2:       # BB#0:
@@ -8127,9 +8127,9 @@ define <2 x double> @test_subpd(<2 x dou
 ;
 ; SKX-LABEL: test_subpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_subpd:
 ; BTVER2:       # BB#0:
@@ -8187,9 +8187,9 @@ define double @test_subsd(double %a0, do
 ;
 ; SKX-LABEL: test_subsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_subsd:
 ; BTVER2:       # BB#0:
@@ -8296,16 +8296,16 @@ define i32 @test_ucomisd(<2 x double> %a
 ; SKX-LABEL: test_ucomisd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vucomisd %xmm1, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %cl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %cl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; SKX-NEXT:    vucomisd (%rdi), %xmm0 # sched: [8:1.00]
-; SKX-NEXT:    setnp %al # sched: [1:1.00]
-; SKX-NEXT:    sete %dl # sched: [1:1.00]
+; SKX-NEXT:    setnp %al # sched: [1:0.50]
+; SKX-NEXT:    sete %dl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %dl # sched: [1:0.25]
 ; SKX-NEXT:    orb %cl, %dl # sched: [1:0.25]
 ; SKX-NEXT:    movzbl %dl, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_ucomisd:
 ; BTVER2:       # BB#0:
@@ -8388,9 +8388,9 @@ define <2 x double> @test_unpckhpd(<2 x
 ; SKX-LABEL: test_unpckhpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [1:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_unpckhpd:
 ; BTVER2:       # BB#0:
@@ -8464,9 +8464,9 @@ define <2 x double> @test_unpcklpd(<2 x
 ; SKX-LABEL: test_unpcklpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [1:1.00]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_unpcklpd:
 ; BTVER2:       # BB#0:
@@ -8533,10 +8533,10 @@ define <2 x double> @test_xorpd(<2 x dou
 ;
 ; SKX-LABEL: test_xorpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    vxorpd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_xorpd:
 ; BTVER2:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/sse3-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse3-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse3-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse3-schedule.ll Sun Oct  8 05:52:54 2017
@@ -49,9 +49,9 @@ define <2 x double> @test_addsubpd(<2 x
 ;
 ; SKX-LABEL: test_addsubpd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addsubpd:
 ; BTVER2:       # BB#0:
@@ -110,9 +110,9 @@ define <4 x float> @test_addsubps(<4 x f
 ;
 ; SKX-LABEL: test_addsubps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_addsubps:
 ; BTVER2:       # BB#0:
@@ -172,8 +172,8 @@ define <2 x double> @test_haddpd(<2 x do
 ; SKX-LABEL: test_haddpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # BB#0:
@@ -233,8 +233,8 @@ define <4 x float> @test_haddps(<4 x flo
 ; SKX-LABEL: test_haddps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # BB#0:
@@ -294,8 +294,8 @@ define <2 x double> @test_hsubpd(<2 x do
 ; SKX-LABEL: test_hsubpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # BB#0:
@@ -355,8 +355,8 @@ define <4 x float> @test_hsubps(<4 x flo
 ; SKX-LABEL: test_hsubps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # BB#0:
@@ -411,8 +411,8 @@ define <16 x i8> @test_lddqu(i8* %a0) {
 ;
 ; SKX-LABEL: test_lddqu:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vlddqu (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vlddqu (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_lddqu:
 ; BTVER2:       # BB#0:
@@ -476,7 +476,7 @@ define void @test_monitor(i8* %a0, i32 %
 ; SKX-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
 ; SKX-NEXT:    movl %esi, %ecx # sched: [1:0.25]
 ; SKX-NEXT:    monitor # sched: [100:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_monitor:
 ; BTVER2:       # BB#0:
@@ -543,9 +543,9 @@ define <2 x double> @test_movddup(<2 x d
 ; SKX-LABEL: test_movddup:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
-; SKX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [5:0.50]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movddup:
 ; BTVER2:       # BB#0:
@@ -614,9 +614,9 @@ define <4 x float> @test_movshdup(<4 x f
 ; SKX-LABEL: test_movshdup:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
-; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [1:0.50]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movshdup:
 ; BTVER2:       # BB#0:
@@ -685,9 +685,9 @@ define <4 x float> @test_movsldup(<4 x f
 ; SKX-LABEL: test_movsldup:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
-; SKX-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [1:0.50]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movsldup:
 ; BTVER2:       # BB#0:
@@ -757,7 +757,7 @@ define void @test_mwait(i32 %a0, i32 %a1
 ; SKX-NEXT:    movl %edi, %ecx # sched: [1:0.25]
 ; SKX-NEXT:    movl %esi, %eax # sched: [1:0.25]
 ; SKX-NEXT:    mwait # sched: [20:2.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mwait:
 ; BTVER2:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/sse41-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse41-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse41-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse41-schedule.ll Sun Oct  8 05:52:54 2017
@@ -48,10 +48,10 @@ define <2 x double> @test_blendpd(<2 x d
 ; SKX-LABEL: test_blendpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovapd (%rdi), %xmm2 # sched: [1:0.50]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vmovapd (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
 ; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_blendpd:
 ; BTVER2:       # BB#0:
@@ -106,9 +106,9 @@ define <4 x float> @test_blendps(<4 x fl
 ;
 ; SKX-LABEL: test_blendps:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
-; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.33]
+; SKX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_blendps:
 ; BTVER2:       # BB#0:
@@ -167,8 +167,8 @@ define <2 x double> @test_blendvpd(<2 x
 ; SKX-LABEL: test_blendvpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_blendvpd:
 ; BTVER2:       # BB#0:
@@ -228,8 +228,8 @@ define <4 x float> @test_blendvps(<4 x f
 ; SKX-LABEL: test_blendvps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_blendvps:
 ; BTVER2:       # BB#0:
@@ -283,8 +283,8 @@ define <2 x double> @test_dppd(<2 x doub
 ; SKX-LABEL: test_dppd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [9:1.00]
-; SKX-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [15:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_dppd:
 ; BTVER2:       # BB#0:
@@ -338,8 +338,8 @@ define <4 x float> @test_dpps(<4 x float
 ; SKX-LABEL: test_dpps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [13:1.33]
-; SKX-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [13:1.33]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [19:1.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_dpps:
 ; BTVER2:       # BB#0:
@@ -393,8 +393,8 @@ define <4 x float> @test_insertps(<4 x f
 ; SKX-LABEL: test_insertps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:1.00]
-; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_insertps:
 ; BTVER2:       # BB#0:
@@ -442,8 +442,8 @@ define <2 x i64> @test_movntdqa(i8* %a0)
 ;
 ; SKX-LABEL: test_movntdqa:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmovntdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_movntdqa:
 ; BTVER2:       # BB#0:
@@ -493,8 +493,8 @@ define <8 x i16> @test_mpsadbw(<16 x i8>
 ; SKX-LABEL: test_mpsadbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [4:2.00]
-; SKX-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [4:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [10:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_mpsadbw:
 ; BTVER2:       # BB#0:
@@ -549,8 +549,8 @@ define <8 x i16> @test_packusdw(<4 x i32
 ; SKX-LABEL: test_packusdw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_packusdw:
 ; BTVER2:       # BB#0:
@@ -611,8 +611,8 @@ define <16 x i8> @test_pblendvb(<16 x i8
 ; SKX-LABEL: test_pblendvb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [2:0.67]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pblendvb:
 ; BTVER2:       # BB#0:
@@ -666,8 +666,8 @@ define <8 x i16> @test_pblendw(<8 x i16>
 ; SKX-LABEL: test_pblendw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pblendw:
 ; BTVER2:       # BB#0:
@@ -719,11 +719,11 @@ define <2 x i64> @test_pcmpeqq(<2 x i64>
 ;
 ; SKX-LABEL: test_pcmpeqq:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; SKX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; SKX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0 # sched: [9:1.00]
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpeqq:
 ; BTVER2:       # BB#0:
@@ -778,8 +778,8 @@ define i32 @test_pextrb(<16 x i8> %a0, i
 ; SKX-LABEL: test_pextrb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpextrb $3, %xmm0, %eax # sched: [3:1.00]
-; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pextrb:
 ; BTVER2:       # BB#0:
@@ -833,8 +833,8 @@ define i32 @test_pextrd(<4 x i32> %a0, i
 ; SKX-LABEL: test_pextrd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpextrd $3, %xmm0, %eax # sched: [3:1.00]
-; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pextrd:
 ; BTVER2:       # BB#0:
@@ -887,8 +887,8 @@ define i64 @test_pextrq(<2 x i64> %a0, <
 ; SKX-LABEL: test_pextrq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpextrq $1, %xmm0, %rax # sched: [3:1.00]
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pextrq:
 ; BTVER2:       # BB#0:
@@ -941,8 +941,8 @@ define i32 @test_pextrw(<8 x i16> %a0, i
 ; SKX-LABEL: test_pextrw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpextrw $3, %xmm0, %eax # sched: [3:1.00]
-; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi) # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pextrw:
 ; BTVER2:       # BB#0:
@@ -995,9 +995,9 @@ define <8 x i16> @test_phminposuw(<8 x i
 ;
 ; SKX-LABEL: test_phminposuw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [4:0.50]
+; SKX-NEXT:    vphminposuw (%rdi), %xmm0 # sched: [10:0.50]
 ; SKX-NEXT:    vphminposuw %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phminposuw:
 ; BTVER2:       # BB#0:
@@ -1051,8 +1051,8 @@ define <16 x i8> @test_pinsrb(<16 x i8>
 ; SKX-LABEL: test_pinsrb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
-; SKX-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrb:
 ; BTVER2:       # BB#0:
@@ -1105,8 +1105,8 @@ define <4 x i32> @test_pinsrd(<4 x i32>
 ; SKX-LABEL: test_pinsrd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [2:2.00]
-; SKX-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrd:
 ; BTVER2:       # BB#0:
@@ -1164,9 +1164,9 @@ define <2 x i64> @test_pinsrq(<2 x i64>
 ; SKX-LABEL: test_pinsrq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [2:2.00]
-; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pinsrq:
 ; BTVER2:       # BB#0:
@@ -1221,9 +1221,9 @@ define <16 x i8> @test_pmaxsb(<16 x i8>
 ;
 ; SKX-LABEL: test_pmaxsb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsb:
 ; BTVER2:       # BB#0:
@@ -1276,9 +1276,9 @@ define <4 x i32> @test_pmaxsd(<4 x i32>
 ;
 ; SKX-LABEL: test_pmaxsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxsd:
 ; BTVER2:       # BB#0:
@@ -1331,9 +1331,9 @@ define <4 x i32> @test_pmaxud(<4 x i32>
 ;
 ; SKX-LABEL: test_pmaxud:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxud:
 ; BTVER2:       # BB#0:
@@ -1386,9 +1386,9 @@ define <8 x i16> @test_pmaxuw(<8 x i16>
 ;
 ; SKX-LABEL: test_pmaxuw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaxuw:
 ; BTVER2:       # BB#0:
@@ -1441,9 +1441,9 @@ define <16 x i8> @test_pminsb(<16 x i8>
 ;
 ; SKX-LABEL: test_pminsb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminsb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminsb:
 ; BTVER2:       # BB#0:
@@ -1496,9 +1496,9 @@ define <4 x i32> @test_pminsd(<4 x i32>
 ;
 ; SKX-LABEL: test_pminsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminsd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminsd:
 ; BTVER2:       # BB#0:
@@ -1551,9 +1551,9 @@ define <4 x i32> @test_pminud(<4 x i32>
 ;
 ; SKX-LABEL: test_pminud:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminud (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminud:
 ; BTVER2:       # BB#0:
@@ -1606,9 +1606,9 @@ define <8 x i16> @test_pminuw(<8 x i16>
 ;
 ; SKX-LABEL: test_pminuw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpminuw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pminuw:
 ; BTVER2:       # BB#0:
@@ -1668,9 +1668,9 @@ define <8 x i16> @test_pmovsxbw(<16 x i8
 ; SKX-LABEL: test_pmovsxbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxbw %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxbw (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbw:
 ; BTVER2:       # BB#0:
@@ -1733,9 +1733,9 @@ define <4 x i32> @test_pmovsxbd(<16 x i8
 ; SKX-LABEL: test_pmovsxbd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxbd %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxbd (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbd:
 ; BTVER2:       # BB#0:
@@ -1798,9 +1798,9 @@ define <2 x i64> @test_pmovsxbq(<16 x i8
 ; SKX-LABEL: test_pmovsxbq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxbq %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxbq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxbq:
 ; BTVER2:       # BB#0:
@@ -1863,9 +1863,9 @@ define <2 x i64> @test_pmovsxdq(<4 x i32
 ; SKX-LABEL: test_pmovsxdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxdq %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxdq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxdq:
 ; BTVER2:       # BB#0:
@@ -1928,9 +1928,9 @@ define <4 x i32> @test_pmovsxwd(<8 x i16
 ; SKX-LABEL: test_pmovsxwd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxwd %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxwd (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxwd:
 ; BTVER2:       # BB#0:
@@ -1993,9 +1993,9 @@ define <2 x i64> @test_pmovsxwq(<8 x i16
 ; SKX-LABEL: test_pmovsxwq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovsxwq %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovsxwq (%rdi), %xmm1 # sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovsxwq:
 ; BTVER2:       # BB#0:
@@ -2058,9 +2058,9 @@ define <8 x i16> @test_pmovzxbw(<16 x i8
 ; SKX-LABEL: test_pmovzxbw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [1:1.00]
-; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [6:1.00]
+; SKX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbw:
 ; BTVER2:       # BB#0:
@@ -2123,9 +2123,9 @@ define <4 x i32> @test_pmovzxbd(<16 x i8
 ; SKX-LABEL: test_pmovzxbd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [6:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbd:
 ; BTVER2:       # BB#0:
@@ -2188,9 +2188,9 @@ define <2 x i64> @test_pmovzxbq(<16 x i8
 ; SKX-LABEL: test_pmovzxbq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxbq:
 ; BTVER2:       # BB#0:
@@ -2253,9 +2253,9 @@ define <2 x i64> @test_pmovzxdq(<4 x i32
 ; SKX-LABEL: test_pmovzxdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxdq:
 ; BTVER2:       # BB#0:
@@ -2318,9 +2318,9 @@ define <4 x i32> @test_pmovzxwd(<8 x i16
 ; SKX-LABEL: test_pmovzxwd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [1:1.00]
-; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [6:1.00]
+; SKX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxwd:
 ; BTVER2:       # BB#0:
@@ -2383,9 +2383,9 @@ define <2 x i64> @test_pmovzxwq(<8 x i16
 ; SKX-LABEL: test_pmovzxwq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [1:1.00]
-; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [6:1.00]
+; SKX-NEXT:    vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmovzxwq:
 ; BTVER2:       # BB#0:
@@ -2442,8 +2442,8 @@ define <2 x i64> @test_pmuldq(<4 x i32>
 ; SKX-LABEL: test_pmuldq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmuldq (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmuldq:
 ; BTVER2:       # BB#0:
@@ -2498,8 +2498,8 @@ define <4 x i32> @test_pmulld(<4 x i32>
 ; SKX-LABEL: test_pmulld:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmulld (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmulld:
 ; BTVER2:       # BB#0:
@@ -2572,12 +2572,12 @@ define i32 @test_ptest(<2 x i64> %a0, <2
 ; SKX-LABEL: test_ptest:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vptest %xmm1, %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setb %al # sched: [1:1.00]
-; SKX-NEXT:    vptest (%rdi), %xmm0 # sched: [3:1.00]
-; SKX-NEXT:    setb %cl # sched: [1:1.00]
+; SKX-NEXT:    setb %al # sched: [1:0.50]
+; SKX-NEXT:    vptest (%rdi), %xmm0 # sched: [9:1.00]
+; SKX-NEXT:    setb %cl # sched: [1:0.50]
 ; SKX-NEXT:    andb %al, %cl # sched: [1:0.25]
 ; SKX-NEXT:    movzbl %cl, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_ptest:
 ; BTVER2:       # BB#0:
@@ -2646,9 +2646,9 @@ define <2 x double> @test_roundpd(<2 x d
 ; SKX-LABEL: test_roundpd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vroundpd $7, %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [8:0.67]
-; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vroundpd $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_roundpd:
 ; BTVER2:       # BB#0:
@@ -2711,9 +2711,9 @@ define <4 x float> @test_roundps(<4 x fl
 ; SKX-LABEL: test_roundps:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vroundps $7, %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [8:0.67]
-; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vroundps $7, (%rdi), %xmm1 # sched: [14:0.67]
+; SKX-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_roundps:
 ; BTVER2:       # BB#0:
@@ -2777,9 +2777,9 @@ define <2 x double> @test_roundsd(<2 x d
 ; SKX-LABEL: test_roundsd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
-; SKX-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_roundsd:
 ; BTVER2:       # BB#0:
@@ -2843,9 +2843,9 @@ define <4 x float> @test_roundss(<4 x fl
 ; SKX-LABEL: test_roundss:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [8:0.67]
-; SKX-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:0.67]
-; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [14:0.67]
+; SKX-NEXT:    vaddps %xmm0, %xmm1, %xmm0 # sched: [4:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_roundss:
 ; BTVER2:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/sse42-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse42-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse42-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse42-schedule.ll Sun Oct  8 05:52:54 2017
@@ -50,7 +50,7 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1,
 ; SKX-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
 ; SKX-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_8:
 ; BTVER2:       # BB#0:
@@ -113,7 +113,7 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1
 ; SKX-NEXT:    crc32w %si, %edi # sched: [3:1.00]
 ; SKX-NEXT:    crc32w (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_16:
 ; BTVER2:       # BB#0:
@@ -176,7 +176,7 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1
 ; SKX-NEXT:    crc32l %esi, %edi # sched: [3:1.00]
 ; SKX-NEXT:    crc32l (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movl %edi, %eax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: crc32_32_32:
 ; BTVER2:       # BB#0:
@@ -239,7 +239,7 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1,
 ; SKX-NEXT:    crc32b %sil, %edi # sched: [3:1.00]
 ; SKX-NEXT:    crc32b (%rdx), %edi # sched: [8:1.00]
 ; SKX-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_8:
 ; BTVER2:       # BB#0:
@@ -302,7 +302,7 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1
 ; SKX-NEXT:    crc32q %rsi, %rdi # sched: [3:1.00]
 ; SKX-NEXT:    crc32q (%rdx), %rdi # sched: [8:1.00]
 ; SKX-NEXT:    movq %rdi, %rax # sched: [1:0.25]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: crc32_64_64:
 ; BTVER2:       # BB#0:
@@ -398,10 +398,10 @@ define i32 @test_pcmpestri(<16 x i8> %a0
 ; SKX-NEXT:    movl %ecx, %esi # sched: [1:0.25]
 ; SKX-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; SKX-NEXT:    movl $7, %edx # sched: [1:0.25]
-; SKX-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [18:4.00]
+; SKX-NEXT:    vpcmpestri $7, (%rdi), %xmm0 # sched: [24:4.00]
 ; SKX-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; SKX-NEXT:    leal (%rcx,%rsi), %eax # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpestri:
 ; BTVER2:       # BB#0:
@@ -494,8 +494,8 @@ define <16 x i8> @test_pcmpestrm(<16 x i
 ; SKX-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 # sched: [19:4.00]
 ; SKX-NEXT:    movl $7, %eax # sched: [1:0.25]
 ; SKX-NEXT:    movl $7, %edx # sched: [1:0.25]
-; SKX-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [19:4.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpestrm:
 ; BTVER2:       # BB#0:
@@ -573,10 +573,10 @@ define i32 @test_pcmpistri(<16 x i8> %a0
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpcmpistri $7, %xmm1, %xmm0 # sched: [10:3.00]
 ; SKX-NEXT:    movl %ecx, %eax # sched: [1:0.25]
-; SKX-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [10:3.00]
+; SKX-NEXT:    vpcmpistri $7, (%rdi), %xmm0 # sched: [16:3.00]
 ; SKX-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
 ; SKX-NEXT:    leal (%rcx,%rax), %eax # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpistri:
 ; BTVER2:       # BB#0:
@@ -637,8 +637,8 @@ define <16 x i8> @test_pcmpistrm(<16 x i
 ; SKX-LABEL: test_pcmpistrm:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 # sched: [10:3.00]
-; SKX-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [10:3.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpistrm:
 ; BTVER2:       # BB#0:
@@ -691,11 +691,11 @@ define <2 x i64> @test_pcmpgtq(<2 x i64>
 ;
 ; SKX-LABEL: test_pcmpgtq:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; SKX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 # sched: [3:1.00]
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; SKX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0 # sched: [9:1.00]
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pcmpgtq:
 ; BTVER2:       # BB#0:
@@ -750,8 +750,8 @@ define <2 x i64> @test_pclmulqdq(<2 x i6
 ; SKX-LABEL: test_pclmulqdq:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [6:1.00]
-; SKX-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pclmulqdq:
 ; BTVER2:       # BB#0:

Modified: llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll?rev=315175&r1=315174&r2=315175&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/ssse3-schedule.ll Sun Oct  8 05:52:54 2017
@@ -56,10 +56,10 @@ define <16 x i8> @test_pabsb(<16 x i8> %
 ;
 ; SKX-LABEL: test_pabsb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpabsb %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpabsb (%rdi), %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpabsb %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpabsb (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pabsb:
 ; BTVER2:       # BB#0:
@@ -128,10 +128,10 @@ define <4 x i32> @test_pabsd(<4 x i32> %
 ;
 ; SKX-LABEL: test_pabsd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpabsd %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpabsd (%rdi), %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpabsd %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpabsd (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pabsd:
 ; BTVER2:       # BB#0:
@@ -200,10 +200,10 @@ define <8 x i16> @test_pabsw(<8 x i16> %
 ;
 ; SKX-LABEL: test_pabsw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpabsw %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpabsw (%rdi), %xmm1 # sched: [1:0.50]
-; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpabsw %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpabsw (%rdi), %xmm1 # sched: [7:0.50]
+; SKX-NEXT:    vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pabsw:
 ; BTVER2:       # BB#0:
@@ -271,8 +271,8 @@ define <8 x i16> @test_palignr(<8 x i16>
 ; SKX-LABEL: test_palignr:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:1.00]
-; SKX-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_palignr:
 ; BTVER2:       # BB#0:
@@ -331,8 +331,8 @@ define <4 x i32> @test_phaddd(<4 x i32>
 ; SKX-LABEL: test_phaddd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # BB#0:
@@ -392,8 +392,8 @@ define <8 x i16> @test_phaddsw(<8 x i16>
 ; SKX-LABEL: test_phaddsw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # BB#0:
@@ -453,8 +453,8 @@ define <8 x i16> @test_phaddw(<8 x i16>
 ; SKX-LABEL: test_phaddw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # BB#0:
@@ -514,8 +514,8 @@ define <4 x i32> @test_phsubd(<4 x i32>
 ; SKX-LABEL: test_phsubd:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # BB#0:
@@ -575,8 +575,8 @@ define <8 x i16> @test_phsubsw(<8 x i16>
 ; SKX-LABEL: test_phsubsw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # BB#0:
@@ -636,8 +636,8 @@ define <8 x i16> @test_phsubw(<8 x i16>
 ; SKX-LABEL: test_phsubw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [3:2.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # BB#0:
@@ -697,8 +697,8 @@ define <8 x i16> @test_pmaddubsw(<16 x i
 ; SKX-LABEL: test_pmaddubsw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmaddubsw:
 ; BTVER2:       # BB#0:
@@ -759,8 +759,8 @@ define <8 x i16> @test_pmulhrsw(<8 x i16
 ; SKX-LABEL: test_pmulhrsw:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:0.33]
-; SKX-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pmulhrsw:
 ; BTVER2:       # BB#0:
@@ -820,8 +820,8 @@ define <16 x i8> @test_pshufb(<16 x i8>
 ; SKX-LABEL: test_pshufb:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_pshufb:
 ; BTVER2:       # BB#0:
@@ -884,9 +884,9 @@ define <16 x i8> @test_psignb(<16 x i8>
 ;
 ; SKX-LABEL: test_psignb:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psignb:
 ; BTVER2:       # BB#0:
@@ -949,9 +949,9 @@ define <4 x i32> @test_psignd(<4 x i32>
 ;
 ; SKX-LABEL: test_psignd:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psignd:
 ; BTVER2:       # BB#0:
@@ -1014,9 +1014,9 @@ define <8 x i16> @test_psignw(<8 x i16>
 ;
 ; SKX-LABEL: test_psignw:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
-; SKX-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [1:0.50]
-; SKX-NEXT:    retq # sched: [2:1.00]
+; SKX-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
+; SKX-NEXT:    vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
+; SKX-NEXT:    retq # sched: [7:1.00]
 ;
 ; BTVER2-LABEL: test_psignw:
 ; BTVER2:       # BB#0:




More information about the llvm-commits mailing list