[llvm] r351198 - [DAGCombiner] reduce buildvec of zexted extracted element to shuffle

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 15 08:11:05 PST 2019


Author: spatel
Date: Tue Jan 15 08:11:05 2019
New Revision: 351198

URL: http://llvm.org/viewvc/llvm-project?rev=351198&view=rev
Log:
[DAGCombiner] reduce buildvec of zexted extracted element to shuffle

The motivating case for this is shown in the first regression test: we were
transferring the element to a scalar register and back rather than just
zero-extending in the vector domain with 'vpmovzxdq'.
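
In IR terms, the motivating pattern looks roughly like this (paraphrasing the
first test in buildvec-extract.ll; the insertelement line is reconstructed
here for illustration):

  %e = extractelement <4 x i32> %x, i32 0
  %z = zext i32 %e to i64
  %r = insertelement <2 x i64> undef, i64 %z, i32 0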

That's a special case of the more general pattern handled here. In all tests,
we now avoid the vector-scalar-vector moves in favor of vector ops, as
sketched below.
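
Conceptually, the combine rewrites such a build vector as a shuffle with a
zero vector followed by a bitcast. A minimal sketch in IR notation (the
combine itself operates on SelectionDAG nodes, not IR):

  ; ZextRatio = 64 / 32 = 2, NumMaskElts = 2 * 2 = 4.
  ; The low lane of the zexted element takes the extract index (0); the
  ; high lane takes lane 0 of the zero vector (mask value 4, i.e.
  ; number-of-elements); undef build-vector elements stay undef.
  %s = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  %bc = bitcast <4 x i32> %s to <2 x i64>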

We aren't producing optimal shuffle code in some cases though, so the patch
is limited (via the target's shuffle-mask legality hook) to avoid
regressions.
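
For example, when the zexted element lands in build-vector element 1 instead
(as in extract3_i32_zext_insert1_i64_undef below), the same mask construction
shifts to the high lanes, and the transform only fires if the target reports
that mask as legal. A sketch under the same assumptions:

  ; ZextElt = 1, extract index 3: mask lanes 2/3 become {3, 4}.
  %s = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer,
                     <4 x i32> <i32 undef, i32 undef, i32 3, i32 4>
  ; With SSE this now lowers to a single 'psrlq $32' (see the test diff).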

Differential Revision: https://reviews.llvm.org/D56281

Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/trunk/test/CodeGen/X86/buildvec-extract.ll

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=351198&r1=351197&r2=351198&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Tue Jan 15 08:11:05 2019
@@ -16195,6 +16195,78 @@ SDValue DAGCombiner::createBuildVecShuff
   return Shuffle;
 }
 
+static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
+  assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+  // First, determine which element of the build vector is not undef.
+  // TODO: We could extend this to handle zero elements as well as undefs.
+  int NumBVOps = BV->getNumOperands();
+  int ZextElt = -1;
+  for (int i = 0; i != NumBVOps; ++i) {
+    SDValue Op = BV->getOperand(i);
+    if (Op.isUndef())
+      continue;
+    if (ZextElt == -1)
+      ZextElt = i;
+    else
+      return SDValue();
+  }
+  // Bail out if there's no non-undef element.
+  if (ZextElt == -1)
+    return SDValue();
+
+  // The build vector contains some number of undef elements and exactly
+  // one other element. That other element must be a zero-extended scalar
+  // extracted from a vector at a constant index to turn this into a shuffle.
+  // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
+  SDValue Zext = BV->getOperand(ZextElt);
+  if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
+      Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)))
+    return SDValue();
+
+  // The destination size of the zero-extend must be a multiple of the
+  // source size.
+  SDValue Extract = Zext.getOperand(0);
+  unsigned DestSize = Zext.getValueSizeInBits();
+  unsigned SrcSize = Extract.getValueSizeInBits();
+  if (DestSize % SrcSize != 0)
+    return SDValue();
+
+  // Create a shuffle mask that will combine the extracted element with zeros
+  // and undefs.
+  int ZextRatio = DestSize / SrcSize;
+  int NumMaskElts = NumBVOps * ZextRatio;
+  SmallVector<int, 32> ShufMask(NumMaskElts, -1);
+  for (int i = 0; i != NumMaskElts; ++i) {
+    if (i / ZextRatio == ZextElt) {
+      // The low bits of the (potentially translated) extracted element map to
+      // the source vector. The high bits map to zero. We will use a zero vector
+      // as the 2nd source operand of the shuffle, so use the 1st element of
+      // that vector (mask value is number-of-elements) for the high bits.
+      if (i % ZextRatio == 0)
+        ShufMask[i] = Extract.getConstantOperandVal(1);
+      else
+        ShufMask[i] = NumMaskElts;
+    }
+
+    // Undef elements of the build vector remain undef because we initialize
+    // the shuffle mask with -1.
+  }
+
+  // Turn this into a shuffle with zero if that's legal.
+  EVT VecVT = Extract.getOperand(0).getValueType();
+  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, VecVT))
+    return SDValue();
+
+  // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
+  // bitcast (shuffle V, ZeroVec, VectorMask)
+  SDLoc DL(BV);
+  SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
+  SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec,
+                                      ShufMask);
+  return DAG.getBitcast(BV->getValueType(0), Shuf);
+}
+
 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
 // operations. If the types of the vectors we're extracting from allow it,
 // turn this into a vector_shuffle node.
@@ -16206,6 +16278,9 @@ SDValue DAGCombiner::reduceBuildVecToShu
   if (!isTypeLegal(VT))
     return SDValue();
 
+  if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
+    return V;
+
   // May only combine to shuffle after legalize if shuffle is legal.
   if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
     return SDValue();

Modified: llvm/trunk/test/CodeGen/X86/buildvec-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/buildvec-extract.ll?rev=351198&r1=351197&r2=351198&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/buildvec-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/buildvec-extract.ll Tue Jan 15 08:11:05 2019
@@ -4,16 +4,20 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx    | FileCheck %s --check-prefixes=ANY,AVX
 
 define <2 x i64> @extract0_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert0_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert0_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i32_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -40,23 +44,14 @@ define <2 x i64> @extract0_i32_zext_inse
 }
 
 define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract1_i32_zext_insert0_i64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $1, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract1_i32_zext_insert0_i64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract1_i32_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -90,23 +85,16 @@ define <2 x i64> @extract1_i32_zext_inse
 }
 
 define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract2_i32_zext_insert0_i64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $2, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract2_i32_zext_insert0_i64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -140,23 +128,14 @@ define <2 x i64> @extract2_i32_zext_inse
 }
 
 define <2 x i64> @extract3_i32_zext_insert0_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert0_i64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract3_i32_zext_insert0_i64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_i32_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %z = zext i32 %e to i64
@@ -190,18 +169,25 @@ define <2 x i64> @extract3_i32_zext_inse
 }
 
 define <2 x i64> @extract0_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE-LABEL: extract0_i32_zext_insert1_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i32_zext_insert1_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i32_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
@@ -232,24 +218,18 @@ define <2 x i64> @extract0_i32_zext_inse
 define <2 x i64> @extract1_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; SSE2-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $1, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract1_i32_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $1, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 1
   %z = zext i32 %e to i64
@@ -288,24 +268,19 @@ define <2 x i64> @extract1_i32_zext_inse
 define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $2, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -342,26 +317,14 @@ define <2 x i64> @extract2_i32_zext_inse
 }
 
 define <2 x i64> @extract3_i32_zext_insert1_i64_undef(<4 x i32> %x) {
-; SSE2-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract3_i32_zext_insert1_i64_undef:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $3, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE41-NEXT:    retq
+; SSE-LABEL: extract3_i32_zext_insert1_i64_undef:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psrlq $32, %xmm0
+; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_i32_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $3, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 3
   %z = zext i32 %e to i64
@@ -398,16 +361,21 @@ define <2 x i64> @extract3_i32_zext_inse
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
@@ -434,16 +402,25 @@ define <2 x i64> @extract0_i16_zext_inse
 }
 
 define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert0_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,0,1,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert0_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -470,16 +447,26 @@ define <2 x i64> @extract1_i16_zext_inse
 }
 
 define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert0_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert0_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -508,14 +495,12 @@ define <2 x i64> @extract2_i16_zext_inse
 define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 ; SSE-LABEL: extract3_i16_zext_insert0_i64_undef:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    psrlq $48, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_i16_zext_insert0_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
+; AVX-NEXT:    vpsrlq $48, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 3
   %z = zext i16 %e to i64
@@ -542,18 +527,24 @@ define <2 x i64> @extract3_i16_zext_inse
 }
 
 define <2 x i64> @extract0_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert1_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert1_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64
@@ -582,18 +573,21 @@ define <2 x i64> @extract0_i16_zext_inse
 }
 
 define <2 x i64> @extract1_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract1_i16_zext_insert1_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $1, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract1_i16_zext_insert1_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract1_i16_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $1, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 1
   %z = zext i16 %e to i64
@@ -622,18 +616,24 @@ define <2 x i64> @extract1_i16_zext_inse
 }
 
 define <2 x i64> @extract2_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract2_i16_zext_insert1_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $2, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract2_i16_zext_insert1_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i16_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 2
   %z = zext i16 %e to i64
@@ -662,18 +662,24 @@ define <2 x i64> @extract2_i16_zext_inse
 }
 
 define <2 x i64> @extract3_i16_zext_insert1_i64_undef(<8 x i16> %x) {
-; SSE-LABEL: extract3_i16_zext_insert1_i64_undef:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $3, %xmm0, %eax
-; SSE-NEXT:    movq %rax, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract3_i16_zext_insert1_i64_undef:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract3_i16_zext_insert1_i64_undef:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $3, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 3
   %z = zext i16 %e to i64
