[llvm] [X86] visitSelect - widen select(cond,extract_subvector(shuffle(vec0)),vec1) if it will create a mask instruction (PR #115223)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 7 02:42:24 PST 2024


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/115223

>From 3298449bdd4fc9619048eba8bfd0bdbda906c9dc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 6 Nov 2024 21:59:34 +0000
Subject: [PATCH] [X86] visitSelect - widen
 select(cond,extract_subvector(shuffle(vec0)),vec1) if it will create a mask
 instruction

This patch extends the existing fold "select(mask, extract_subvector(shuffle(x)), zero) --> extract_subvector(select(insert_subvector(mask), shuffle(x), zero))" to also handle the case where the other select operand is not zero.

I've restricted VPERMV3 (3 vector operand) shuffles to the zero-select case only: the masked instructions only match when we select with one of the source vectors, and in most circumstances we are not selecting with either of them.

We should be able to generalise this in the future to work with other maskable instructions, but this is a good initial improvement.

Fixes #113400
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   9 +-
 .../X86/avx512-shuffles/partial_permute.ll    | 327 ++++++++++--------
 2 files changed, 194 insertions(+), 142 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c15517249eb819..c0938b8b08f00d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -46835,9 +46835,9 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
   }
 
-  // AVX512 - Extend select with zero to merge with target shuffle.
-  // select(mask, extract_subvector(shuffle(x)), zero) -->
-  // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
+  // AVX512 - Extend select to merge with target shuffle.
+  // select(mask, extract_subvector(shuffle(x)), y) -->
+  // extract_subvector(select(widen(mask), shuffle(x), widen(y)))
   // TODO - support non target shuffles as well with canCombineAsMaskOperation.
   if (Subtarget.hasAVX512() && CondVT.isVector() &&
       CondVT.getVectorElementType() == MVT::i1) {
@@ -46847,7 +46847,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
              isNullConstant(Op.getOperand(1)) &&
              TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
              Op.hasOneUse() && Op.getOperand(0).hasOneUse() &&
-             ISD::isBuildVectorAllZeros(Alt.getNode());
+             (Op.getOperand(0).getOpcode() != X86ISD::VPERMV3 ||
+              ISD::isBuildVectorAllZeros(Alt.getNode()));
     };
 
     bool SelectableLHS = SelectableOp(LHS, RHS);
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 1b9e9b200a9e3c..ffbeeb19a4aebd 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -18,10 +18,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [8,6,12,4,7,9,14,8]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8>
@@ -47,10 +48,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [4,12,9,4,14,15,12,14]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
@@ -76,10 +78,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [4,11,14,10,7,1,6,9]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9>
@@ -116,10 +119,11 @@ define <8 x i16> @test_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec) {
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask3(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [14,15,7,13,4,12,8,0]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
@@ -403,10 +407,11 @@ define <16 x i16> @test_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec) {
 define <16 x i16> @test_masked_32xi16_to_16xi16_perm_mask3(<32 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_16xi16_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [1,0,30,5,3,6,25,29,0,13,3,8,7,20,11,5]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmw %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmw %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 1, i32 0, i32 30, i32 5, i32 3, i32 6, i32 25, i32 29, i32 0, i32 13, i32 3, i32 8, i32 7, i32 20, i32 11, i32 5>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -473,10 +478,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask0(<32 x i16> %vec, <8 x
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [1,21,27,10,8,19,14,5]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5>
@@ -502,10 +508,11 @@ define <8 x i16> @test_masked_z_32xi16_to_8xi16_perm_mask1(<32 x i16> %vec, <8 x
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask2(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [15,13,18,16,9,11,26,8]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8>
@@ -542,10 +549,11 @@ define <8 x i16> @test_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec) {
 define <8 x i16> @test_masked_32xi16_to_8xi16_perm_mask3(<32 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_32xi16_to_8xi16_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbw {{.*#+}} xmm3 = [17,0,23,10,1,8,7,30]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmw %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmw %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 17, i32 0, i32 23, i32 10, i32 1, i32 8, i32 7, i32 30>
@@ -906,10 +914,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [4,0,3,2]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 4, i32 0, i32 3, i32 2>
@@ -935,10 +944,11 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask0(<8 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,0,7,3]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 3, i32 0, i32 7, i32 3>
@@ -1002,10 +1012,11 @@ define <4 x i32> @test_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec) {
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask3(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,3,2,5]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 5, i32 3, i32 2, i32 5>
@@ -1188,10 +1199,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [1,13,11,14,7,10,1,6]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 1, i32 13, i32 11, i32 14, i32 7, i32 10, i32 1, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1215,10 +1227,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask0(<16 x i32> %vec, <8 x
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [3,0,15,3,2,3,6,8]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 3, i32 0, i32 15, i32 3, i32 2, i32 3, i32 6, i32 8>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1242,10 +1255,11 @@ define <8 x i32> @test_masked_z_16xi32_to_8xi32_perm_mask1(<16 x i32> %vec, <8 x
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask2(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [2,15,15,2,6,10,14,7]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1279,10 +1293,11 @@ define <8 x i32> @test_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec) {
 define <8 x i32> @test_masked_16xi32_to_8xi32_perm_mask3(<16 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_8xi32_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [14,5,7,7,10,3,9,3]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 14, i32 5, i32 7, i32 7, i32 10, i32 3, i32 9, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1317,10 +1332,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,2,4,12,4,6,4,12]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 12>
@@ -1346,11 +1362,12 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
@@ -1377,10 +1394,11 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask2(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [1,1,13,0]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 1, i32 1, i32 13, i32 0>
@@ -1417,10 +1435,11 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec) {
 define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask3(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm3 = [3,0,0,13]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 13>
@@ -1782,9 +1801,10 @@ define <2 x i64> @test_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec) {
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,0,2,3]
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 0>
@@ -1809,9 +1829,10 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mask0(<4 x i64> %vec, <2 x i
 define <2 x i64> @test_masked_4xi64_to_2xi64_perm_mask1(<4 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_4xi64_to_2xi64_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <2 x i32> <i32 2, i32 1>
@@ -1939,10 +1960,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
@@ -1983,10 +2005,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
@@ -2044,10 +2067,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [6,0,0,7]
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
@@ -2088,10 +2112,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
@@ -2132,10 +2157,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask5:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [4,1,0,6]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 4, i32 1, i32 0, i32 6>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2176,10 +2202,11 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
-; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-FAST-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
@@ -2221,10 +2248,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
 define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vpmovsxbq {{.*#+}} ymm3 = [2,0,3,4]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT:    vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2258,9 +2286,10 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2285,10 +2314,11 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
 ; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
@@ -2699,10 +2729,11 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; CHECK-FAST-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,1]
-; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm2
 ; CHECK-FAST-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-FAST-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-FAST-NEXT:    vzeroupper
 ; CHECK-FAST-NEXT:    retq
 ;
@@ -2815,11 +2846,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [1,3,5,0]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
@@ -2846,11 +2878,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [3,2,7,0]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
@@ -2888,11 +2921,12 @@ define <4 x float> @test_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec) {
 define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask3(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [3,3,5,2]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 3, i32 5, i32 2>
@@ -3087,11 +3121,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask0(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,4,12,10,8,2,11,7]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 12, i32 10, i32 8, i32 2, i32 11, i32 7>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3116,11 +3151,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask0(<16 x float> %v
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask1(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [10,12,3,12,4,15,1,14]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3145,11 +3181,12 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mask1(<16 x float> %v
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask2(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask2:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,4,8,9,6,1,4,4]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3184,11 +3221,12 @@ define <8 x float> @test_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec) {
 define <8 x float> @test_masked_16xfloat_to_8xfloat_perm_mask3(<16 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_8xfloat_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [12,14,9,0,12,4,5,8]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <8 x i32> <i32 12, i32 14, i32 9, i32 0, i32 12, i32 4, i32 5, i32 8>
   %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
@@ -3224,11 +3262,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask0(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [4,8,9,10]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 4, i32 8, i32 9, i32 10>
@@ -3255,11 +3294,12 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-FAST-NEXT:    vmovaps {{.*#+}} xmm3 = [8,6,10,6]
-; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-FAST-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-FAST-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-FAST-NEXT:    vzeroupper
 ; CHECK-FAST-NEXT:    retq
 ;
@@ -3352,11 +3392,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) {
 define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
 ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm3 = [10,2,11,6]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 10, i32 2, i32 11, i32 6>
@@ -3746,10 +3787,11 @@ define <2 x double> @test_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec) {
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask0(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,2,3]
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,3]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3775,10 +3817,11 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mask0(<4 x double>
 define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mask1(<4 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 ; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[1,3,2,3]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 1, i32 3>
@@ -3887,12 +3930,13 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [7,3,7,3]
 ; CHECK-NEXT:    # ymm3 = mem[0,1,0,1]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3918,11 +3962,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask1(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,0,7,6]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 0, i32 7, i32 6>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -3983,11 +4028,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask3(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask3:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [0,2,1,4]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 4>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4061,11 +4107,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} ymm3 = [2,6,2,2]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 2, i32 6, i32 2, i32 2>
   %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4107,11 +4154,12 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [5,0,7,0]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
@@ -4155,11 +4203,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
 ; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,0,6]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
@@ -4221,11 +4270,12 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
 ; CHECK-FAST:       # %bb.0:
+; CHECK-FAST-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-FAST-NEXT:    vmovapd {{.*#+}} xmm3 = [0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0]
-; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-FAST-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-FAST-NEXT:    vzeroupper
 ; CHECK-FAST-NEXT:    retq
 ;
@@ -4271,11 +4321,12 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double>
 define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask1(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
 ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask1:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; CHECK-NEXT:    vmovapd {{.*#+}} xmm3 = [3,7]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
-; CHECK-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT:    vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 3, i32 7>



More information about the llvm-commits mailing list