[llvm] r291584 - [X86][AVX512]Improving shuffle lowering by using AVX-512 EXPAND* instructions

Michael Zuckerman via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 10 10:57:18 PST 2017


Author: mzuckerm
Date: Tue Jan 10 12:57:17 2017
New Revision: 291584

URL: http://llvm.org/viewvc/llvm-project?rev=291584&view=rev
Log:
[X86][AVX512]Improving shuffle lowering by using AVX-512 EXPAND* instructions 

This patch fixes PR31351: https://llvm.org/bugs/show_bug.cgi?id=31351

1.  This patch adds a new type of shuffle lowering.
2.  We can use the expand instruction when the shuffle pattern is as follows:
    { 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order },
    i.e. each source element may be preceded by any number of zeros while the source
    elements keep their original order (see the sketch after this list).
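For example, the following shuffle keeps a[0] and a[1] of the source in their original
order and fills every other lane with zero, so it can be lowered to a zero-masked expand.
This is a minimal IR sketch mirroring the expand() test added below; the function name
@expand_example is only illustrative:

   define <8 x float> @expand_example(<4 x float> %a) {
     ; Indices >= 4 select from the zero vector, so only lanes 0 and 2 take a
     ; source element; the shuffle can become vexpandps with mask 0b00000101 (= 5).
     %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
     ret <8 x float> %res
   }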

Reviewers: igorb, guyblank, craig.topper, RKSimon

Differential Revision: https://reviews.llvm.org/D28352


Added:
    llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=291584&r1=291583&r2=291584&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Tue Jan 10 12:57:17 2017
@@ -8090,6 +8090,37 @@ static SmallBitVector computeZeroableShu
   return Zeroable;
 }
 
+// The shuffle result has the following form:
+// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
+// Each element of Zeroable corresponds to a particular element of Mask, as
+// described in the computeZeroableShuffleElements function.
+//
+// The function looks for a sub-mask whose nonzero elements appear in
+// increasing order. If such a sub-mask exists, the function returns true.
+static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
+                                     ArrayRef<int> Mask,const EVT &VectorType,
+                                     bool &IsZeroSideLeft) {
+  int NextElement = -1;
+  // Check if the Mask's nonzero elements are in increasing order.
+  for (int i = 0, e = Zeroable.size(); i < e; i++) {
+    // Bail out on undef mask elements; zeroable lanes must be built only from zeros.
+    if (Mask[i] == -1)
+      return false;
+    if (Zeroable[i])
+      continue;
+    // Find the lowest non zero element
+    if (NextElement == -1) {
+      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
+      IsZeroSideLeft = NextElement != 0;
+    }
+    // Exit if the mask's non zero elements are not in increasing order.
+    if (NextElement != Mask[i])
+      return false;
+    NextElement++;
+  }
+  return true;
+}
+
 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
@@ -8145,6 +8176,46 @@ static SDValue lowerVectorShuffleWithPSH
                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
 }
 
+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
+
+// The function takes a SmallBitVector as an argument and converts it to an
+// unsigned integer.
+// The output of the function is not(Zeroable).
+static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
+  unsigned convertBit = 0;
+  for (int i = 0, e = Zeroable.size(); i < e; i++)
+    convertBit |= !(Zeroable[i]) << i;
+  return convertBit;
+}
+
+// X86 has a dedicated shuffle pattern that can be lowered to VEXPAND.
+static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
+                                          const SmallBitVector &Zeroable,
+                                          ArrayRef<int> Mask, SDValue &V1,
+                                          SDValue &V2, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget) {
+  bool IsLeftZeroSide = true;
+  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
+                                IsLeftZeroSide))
+    return SDValue();
+  unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+  MVT IntegerType =
+      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
+  unsigned NumElts = VT.getVectorNumElements();
+  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
+         "Unexpected number of vector elements");
+  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+                              Subtarget, DAG, DL);
+  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
+  return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
+                     DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
+                     ZeroVector);
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -12159,6 +12230,11 @@ static SDValue lowerV4F64VectorShuffle(c
     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
       return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v4 we
   // can fully permute the elements.
@@ -12222,12 +12298,17 @@ static SDValue lowerV4I64VectorShuffle(c
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or VEXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use PALIGNR.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
                                                       Mask, Subtarget, DAG))
@@ -12328,6 +12409,11 @@ static SDValue lowerV8F32VectorShuffle(c
   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
     return Result;
+  // If we have VLX support, we can use VEXPAND.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
 
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
@@ -12392,12 +12478,17 @@ static SDValue lowerV8I32VectorShuffle(c
                                                 Zeroable, Subtarget, DAG))
     return Shift;
 
-  // If we have VLX support, we can use VALIGN.
-  if (Subtarget.hasVLX())
+  // If we have VLX support, we can use VALIGN or EXPAND.
+  if (Subtarget.hasVLX()) {
     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
                                                     Mask, Subtarget, DAG))
       return Rotate;
 
+    if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
+                                               V1, V2, DAG, Subtarget))
+      return V;
+  }
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
@@ -12754,6 +12845,7 @@ static SDValue lowerV4X128VectorShuffle(
 
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
+                                       const SmallBitVector &Zeroable, 
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12796,11 +12888,16 @@ static SDValue lowerV8F64VectorShuffle(c
       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
     return Op;
 
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable, 
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12832,6 +12929,10 @@ static SDValue lowerV16F32VectorShuffle(
     // Otherwise, fall back to a SHUFPS sequence.
     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
 }
@@ -12889,6 +12990,10 @@ static SDValue lowerV8I64VectorShuffle(c
   if (SDValue Unpck =
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
+                                             V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
@@ -12953,6 +13058,10 @@ static SDValue lowerV16I32VectorShuffle(
                                                   CastV1, CastV2, DAG);
     return DAG.getBitcast(MVT::v16i32, ShufPS);
   }
+  // If we have AVX512F support, we can use VEXPAND.
+  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
+                                             V1, V2, DAG, Subtarget))
+    return V;
 
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
@@ -13089,9 +13198,9 @@ static SDValue lower512BitVectorShuffle(
   // the requisite ISA extensions for that element type are available.
   switch (VT.SimpleTy) {
   case MVT::v8f64:
-    return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:

Added: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll?rev=291584&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll (added)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll Tue Jan 10 12:57:17 2017
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnu -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+;expand 128 -> 256: covers <4 x float> and <2 x double>
+define <8 x float> @expand(<4 x float> %a) {
+; SKX-LABEL: expand:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $5, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7]
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 5, i32 1, i32 5, i32 5, i32 5, i32 5, i32 5>
+   ret <8 x float> %res
+}
+
+define <8 x float> @expand1(<4 x float> %a ) {
+; SKX-LABEL: expand1:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $-86, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand1:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = <u,0,u,1,u,2,u,3>
+; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+   ret <8 x float> %res
+}
+
+;Expand 128 -> 256 test <2 x double> -> <4 x double>
+define <4 x double> @expand2(<2 x double> %a) {
+; SKX-LABEL: expand2:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $9, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandpd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand2:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3]
+; KNL-NEXT:    retq
+   %res = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 2, i32 1>
+   ret <4 x double> %res
+}
+
+;expand 128 -> 256: covers the <4 x i32> -> <8 x i32> case
+define <8 x i32> @expand3(<4 x i32> %a ) {
+; SKX-LABEL: expand3:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand3:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT:    vpbroadcastq %xmm0, %ymm0
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <8 x i32> <i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0,i32 5>
+   ret <8 x i32> %res
+}
+
+;expand 128 -> 256: covers the <2 x i64> -> <4 x i64> case
+define <4 x i64> @expand4(<2 x i64> %a ) {
+; SKX-LABEL: expand4:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $9, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand4:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
+; KNL-NEXT:    retq
+   %res = shufflevector <2 x i64> zeroinitializer, <2 x i64> %a, <4 x i32> <i32 2, i32 0, i32 0, i32 3>
+   ret <4 x i64> %res
+}
+
+;Negative test for 128 -> 256
+define <8 x float> @expand5(<4 x float> %a ) {
+; SKX-LABEL: expand5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vbroadcastss %xmm0, %ymm0
+; SKX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand5:
+; KNL:       # BB#0:
+; KNL-NEXT:    vbroadcastss %xmm0, %ymm0
+; KNL-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
+   ret <8 x float> %res
+}
+
+;expand 256 -> 512: covers <8 x float> and <16 x float>
+define <8 x float> @expand6(<4 x float> %a ) {
+; SKX-LABEL: expand6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand6:
+; KNL:       # BB#0:
+; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+   ret <8 x float> %res
+}
+
+define <16 x float> @expand7(<8 x float> %a) {
+; SKX-LABEL: expand7:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    movw $1285, %ax # imm = 0x505
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand7:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    movw $1285, %ax # imm = 0x505
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+   %res = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 8, i32 8, i32 8, i32 8, i32 2, i32 8, i32 3, i32 8, i32 8, i32 8, i32 8, i32 8>
+   ret <16 x float> %res
+}
+
+define <16 x float> @expand8(<8 x float> %a ) {
+; SKX-LABEL: expand8:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand8:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+   ret <16 x float> %res
+}
+
+;expand 256 -> 512: covers <4 x double> and <8 x double>
+define <8 x double> @expand9(<4 x double> %a) {
+; SKX-LABEL: expand9:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand9:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    movb $-127, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+   ret <8 x double> %res
+}
+
+define <16 x i32> @expand10(<8 x i32> %a ) {
+; SKX-LABEL: expand10:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand10:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+   %res = shufflevector <8 x i32> zeroinitializer, <8 x i32> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+   ret <16 x i32> %res
+}
+
+define <8 x i64> @expand11(<4 x i64> %a) {
+; SKX-LABEL: expand11:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    movb $-127, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand11:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    movb $-127, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+   %res = shufflevector <4 x i64> %a, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 1>
+   ret <8 x i64> %res
+}
+
+;Negative test for 256 -> 512
+define <16 x float> @expand12(<8 x float> %a) {
+; SKX-LABEL: expand12:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; SKX-NEXT:    vxorps %zmm1, %zmm1, %zmm1
+; SKX-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand12:
+; KNL:       # BB#0:
+; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vmovaps {{.*#+}} zmm2 = [0,16,2,16,4,16,6,16,0,16,1,16,2,16,3,16]
+; KNL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vpermt2ps %zmm0, %zmm2, %zmm1
+; KNL-NEXT:    vmovaps %zmm1, %zmm0
+; KNL-NEXT:    retq
+   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8,i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+   ret <16 x float> %res
+}
+
+define <16 x float> @expand13(<8 x float> %a ) {
+; SKX-LABEL: expand13:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand13:
+; KNL:       # BB#0:
+; KNL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; KNL-NEXT:    retq
+   %res = shufflevector <8 x float> zeroinitializer, <8 x float> %a, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+   ret <16 x float> %res
+}
+
+; Test a case where the source vector holds mixed values and the shuffle mask selects its zero elements.
+
+define <8 x float> @expand14(<4 x float> %a) {
+; SKX-LABEL: expand14:
+; SKX:       # BB#0:
+; SKX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; SKX-NEXT:    movb $20, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand14:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT:    retq
+   %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 3, i32 3, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+   ret <8 x float> %res
+}
+
+;Negative test.
+define <8 x float> @expand15(<4 x float> %a) {
+; SKX-LABEL: expand15:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u>
+; SKX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0]
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3]
+; SKX-NEXT:    vpermi2ps %ymm1, %ymm2, %ymm0
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: expand15:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; KNL-NEXT:    vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u>
+; KNL-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0]
+; KNL-NEXT:    vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1]
+; KNL-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7]
+; KNL-NEXT:    retq
+   %addV = fadd <4 x float> <float 0.0,float 1.0,float 2.0,float 0.0> , <float 0.0,float 1.0,float 2.0,float 0.0>
+   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> <i32 0, i32 1, i32 4, i32 0, i32 5, i32 0, i32 0, i32 0>
+   ret <8 x float> %res
+}



