[llvm] acbc5ed - [X86][SSE] getFauxShuffle - support insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the byte level

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 26 07:31:29 PDT 2020


Author: Simon Pilgrim
Date: 2020-04-26T15:31:01+01:00
New Revision: acbc5ede9916a22b06341647d94e5dff51af32a2

URL: https://github.com/llvm/llvm-project/commit/acbc5ede9916a22b06341647d94e5dff51af32a2
DIFF: https://github.com/llvm/llvm-project/commit/acbc5ede9916a22b06341647d94e5dff51af32a2.diff

LOG: [X86][SSE] getFauxShuffle - support insert(truncate/extend(extract(vec0,c0)),vec1,c1) shuffle patterns at the byte level

Follow-up to the PR45604 fix at rGe71dd7c011a3, where we disabled most of these cases.

By creating the shuffle at the byte level, we can handle any extension/truncation, as long as we track how small the scalar got and assume that the upper bytes will need to be zero.
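
For illustration, here is a minimal standalone C++ sketch of the byte-level mask construction described above. The helper name buildByteInsertMask, its parameter list and the sentinel constants are illustrative stand-ins rather than LLVM API; the real logic lives in getFauxShuffleMask in X86ISelLowering.cpp (see the diff below).

#include <algorithm>
#include <cstdio>
#include <vector>

// Stand-ins for LLVM's shuffle sentinels (SM_SentinelUndef / SM_SentinelZero);
// the values here are only for printing, not the real LLVM definitions.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Sketch: build the byte-level mask for
//   insert(trunc/ext(extract(Src, SrcIdx)), Base, DstIdx)
// Bytes of the destination element beyond the narrowest width the scalar
// reached while peeking through trunc/aext/zext are forced to zero.
std::vector<int> buildByteInsertMask(unsigned NumSizeInBytes,  // vector width in bytes
                                     unsigned NumBytesPerElt,  // destination element width in bytes
                                     unsigned SrcBytesPerElt,  // source element width in bytes
                                     unsigned SrcIdx, unsigned DstIdx,
                                     unsigned MinBytesPerElt,  // narrowest scalar width seen, in bytes
                                     bool IsScalarToVector) {
  unsigned SrcByte = SrcIdx * SrcBytesPerElt;
  unsigned DstByte = DstIdx * NumBytesPerElt;

  // 'Identity' base mask: all-undef for SCALAR_TO_VECTOR, otherwise pass the
  // base vector through (second operand, hence the NumSizeInBytes offset).
  std::vector<int> Mask(NumSizeInBytes, SentinelUndef);
  if (!IsScalarToVector)
    for (unsigned i = 0; i != NumSizeInBytes; ++i)
      Mask[i] = (int)(NumSizeInBytes + i);

  // Copy the surviving low bytes of the scalar, then zero the upper bytes of
  // the destination element.
  MinBytesPerElt = std::min(MinBytesPerElt, NumBytesPerElt);
  for (unsigned i = 0; i != MinBytesPerElt; ++i)
    Mask[DstByte + i] = (int)(SrcByte + i);
  for (unsigned i = MinBytesPerElt; i < NumBytesPerElt; ++i)
    Mask[DstByte + i] = SentinelZero;
  return Mask;
}

int main() {
  // Mirrors extract2_i32_zext_insert1_i64_zero: lane 2 of a v4i32 is zero
  // extended to i64 and inserted into lane 1 of a zero v2i64, so bytes 8-11
  // come from source bytes 8-11 and bytes 12-15 are zeroed.
  std::vector<int> Mask = buildByteInsertMask(16, 8, 4, /*SrcIdx=*/2,
                                              /*DstIdx=*/1, /*MinBytesPerElt=*/4,
                                              /*IsScalarToVector=*/false);
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}

Run as-is this prints 16 17 18 19 20 21 22 23 8 9 10 11 -2 -2 -2 -2: the base vector's low i64 is kept, four source bytes are copied and four are zeroed, matching the single blend (with zeroed upper bytes) seen in the updated SSE41/AVX checks.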

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/buildvec-extract.ll
    llvm/test/CodeGen/X86/buildvec-insertvec.ll
    llvm/test/CodeGen/X86/extract-concat.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6aa42fba4eb0..155df1577f45 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7462,16 +7462,18 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     }
 
     // Peek through trunc/aext/zext.
-    // TODO: handle elements smaller than VT.
     // TODO: aext shouldn't require SM_SentinelZero padding.
     // TODO: handle shift of scalars.
+    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
     while (Scl.getOpcode() == ISD::TRUNCATE ||
            Scl.getOpcode() == ISD::ANY_EXTEND ||
            Scl.getOpcode() == ISD::ZERO_EXTEND) {
       Scl = Scl.getOperand(0);
-      if (Scl.getScalarValueSizeInBits() < NumBitsPerElt)
-        return false;
+      if (MinBitsPerElt > Scl.getScalarValueSizeInBits())
+        MinBitsPerElt = Scl.getScalarValueSizeInBits();
     }
+    if ((MinBitsPerElt % 8) != 0)
+      return false;
 
     // Attempt to find the source vector the scalar was extracted from.
     SDValue SrcExtract;
@@ -7486,31 +7488,29 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
 
     SDValue SrcVec = SrcExtract.getOperand(0);
     EVT SrcVT = SrcVec.getValueType();
-    unsigned NumSrcElts = SrcVT.getVectorNumElements();
-    unsigned NumZeros =
-        std::max<int>((NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1, 0);
-
-    if ((NumSrcElts % NumElts) != 0)
+    if (!SrcVT.getScalarType().isByteSized())
       return false;
-
     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
-    if (NumSrcElts <= SrcIdx)
-      return false;
+    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+    unsigned DstByte = DstIdx * NumBytesPerElt;
 
+    // Create 'identity' byte level shuffle mask and then add inserted bytes.
     if (Opcode == ISD::SCALAR_TO_VECTOR) {
       Ops.push_back(SrcVec);
-      Mask.append(NumSrcElts, SM_SentinelUndef);
+      Mask.append(NumSizeInBytes, SM_SentinelUndef);
     } else {
       Ops.push_back(SrcVec);
       Ops.push_back(N.getOperand(0));
-      for (int i = 0; i != (int)NumSrcElts; ++i)
-        Mask.push_back(NumSrcElts + i);
+      for (int i = 0; i != (int)NumSizeInBytes; ++i)
+        Mask.push_back(NumSizeInBytes + i);
     }
 
-    int Scale = NumSrcElts / NumElts;
-    Mask[Scale * DstIdx] = SrcIdx;
-    for (int i = 0; i != (int)NumZeros; ++i)
-      Mask[(Scale * DstIdx) + i + 1] = SM_SentinelZero;
+    unsigned MinBytesPerElts = MinBitsPerElt / 8;
+    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+    for (unsigned i = 0; i != MinBytesPerElts; ++i)
+      Mask[DstByte + i] = SrcByte + i;
+    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+      Mask[DstByte + i] = SM_SentinelZero;
     return true;
   }
   case X86ISD::PACKSS:

diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index 218701250e43..863ab4dee123 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -293,24 +293,19 @@ define <2 x i64> @extract2_i32_zext_insert1_i64_undef(<4 x i32> %x) {
 define <2 x i64> @extract2_i32_zext_insert1_i64_zero(<4 x i32> %x) {
 ; SSE2-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movq %rax, %xmm0
-; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    extractps $2, %xmm0, %eax
-; SSE41-NEXT:    movq %rax, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; SSE41-NEXT:    xorps %xmm1, %xmm1
+; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract2_i32_zext_insert1_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vextractps $2, %xmm0, %eax
-; AVX-NEXT:    vmovq %rax, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
 ; AVX-NEXT:    retq
   %e = extractelement <4 x i32> %x, i32 2
   %z = zext i32 %e to i64
@@ -386,16 +381,22 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
 }
 
 define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
-; SSE-LABEL: extract0_i16_zext_insert0_i64_zero:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pextrw $0, %xmm0, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pextrw $0, %xmm0, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: extract0_i16_zext_insert0_i64_zero:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpextrw $0, %xmm0, %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
 ; AVX-NEXT:    retq
   %e = extractelement <8 x i16> %x, i32 0
   %z = zext i16 %e to i64

diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 3add65914b58..9fb78491b608 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -21,10 +21,7 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; SSE41-LABEL: foo:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT:    pextrb $8, %xmm0, %eax
-; SSE41-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE41-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE41-NEXT:    movl $255, %eax
 ; SSE41-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE41-NEXT:    movd %xmm0, (%rdi)

diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
index 085560c1a504..26e07d86bfc3 100644
--- a/llvm/test/CodeGen/X86/extract-concat.ll
+++ b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -24,10 +24,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; SSE42-LABEL: foo:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
-; SSE42-NEXT:    pextrb $4, %xmm0, %ecx
-; SSE42-NEXT:    pinsrb $1, %ecx, %xmm0
-; SSE42-NEXT:    pinsrb $2, %eax, %xmm0
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -36,10 +33,7 @@ define void @foo(<4 x float> %in, <4 x i8>* %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX-NEXT:    vpextrb $4, %xmm0, %ecx
-; AVX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index c80ff1e8ee33..6b42178c6719 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3028,40 +3028,109 @@ define void @PR43024() {
 }
 
 define void @PR45604(<32 x i16>* %dst, <8 x i16>* %src) {
-; SSE-LABEL: PR45604:
-; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa (%rsi), %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    movzwl %ax, %eax
-; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movl $11, %eax
-; SSE-NEXT:    pinsrw $2, %eax, %xmm0
-; SSE-NEXT:    pextrw $1, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm0
-; SSE-NEXT:    pinsrw $6, %eax, %xmm0
-; SSE-NEXT:    pextrw $2, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm2
-; SSE-NEXT:    pinsrw $2, %eax, %xmm2
-; SSE-NEXT:    pextrw $3, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm2
-; SSE-NEXT:    pinsrw $6, %eax, %xmm2
-; SSE-NEXT:    pextrw $4, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm3
-; SSE-NEXT:    pinsrw $2, %eax, %xmm3
-; SSE-NEXT:    pextrw $5, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm3
-; SSE-NEXT:    pinsrw $6, %eax, %xmm3
-; SSE-NEXT:    pextrw $6, %xmm1, %ecx
-; SSE-NEXT:    movd %ecx, %xmm4
-; SSE-NEXT:    pinsrw $2, %eax, %xmm4
-; SSE-NEXT:    pextrw $7, %xmm1, %ecx
-; SSE-NEXT:    pinsrw $4, %ecx, %xmm4
-; SSE-NEXT:    pinsrw $6, %eax, %xmm4
-; SSE-NEXT:    movdqa %xmm4, 48(%rdi)
-; SSE-NEXT:    movdqa %xmm3, 32(%rdi)
-; SSE-NEXT:    movdqa %xmm2, 16(%rdi)
-; SSE-NEXT:    movdqa %xmm0, (%rdi)
-; SSE-NEXT:    retq
+; SSE2-LABEL: PR45604:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa (%rsi), %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    movzwl %ax, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movl $11, %eax
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE2-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    pextrw $2, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE2-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE2-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE2-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE2-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE2-NEXT:    movd %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE2-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE2-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: PR45604:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movdqa (%rsi), %xmm1
+; SSSE3-NEXT:    movd %xmm1, %eax
+; SSSE3-NEXT:    movzwl %ax, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movl $11, %eax
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $1, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm0
+; SSSE3-NEXT:    pextrw $2, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $3, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm2
+; SSSE3-NEXT:    pextrw $4, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $5, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm3
+; SSSE3-NEXT:    pextrw $6, %xmm1, %ecx
+; SSSE3-NEXT:    movd %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $2, %eax, %xmm4
+; SSSE3-NEXT:    pextrw $7, %xmm1, %ecx
+; SSSE3-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSSE3-NEXT:    pinsrw $6, %eax, %xmm4
+; SSSE3-NEXT:    movdqa %xmm4, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm2, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR45604:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa (%rsi), %xmm1
+; SSE41-NEXT:    pextrw $2, %xmm1, %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movl $11, %eax
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE41-NEXT:    pextrw $3, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm0
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE41-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm2
+; SSE41-NEXT:    pextrw $5, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm2
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm2
+; SSE41-NEXT:    pextrw $6, %xmm1, %ecx
+; SSE41-NEXT:    movd %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm3
+; SSE41-NEXT:    pextrw $7, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm3
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm3
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3,4,5,6,7]
+; SSE41-NEXT:    pinsrw $2, %eax, %xmm4
+; SSE41-NEXT:    pextrw $1, %xmm1, %ecx
+; SSE41-NEXT:    pinsrw $4, %ecx, %xmm4
+; SSE41-NEXT:    pinsrw $6, %eax, %xmm4
+; SSE41-NEXT:    movdqa %xmm4, (%rdi)
+; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
+; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: PR45604:
 ; AVX1:       # %bb.0:


        

