[llvm] r366441 - [X86] EltsFromConsecutiveLoads - support common source loads
Reid Kleckner via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 19 11:06:58 PDT 2019
Reduction:
$ cat SkOpts_sse41-c0ad66.reduced.cpp
extern "C" void *memcpy(void *, const void *, unsigned long);
template <typename a> using b = a __attribute__((ext_vector_type(4)));
using c = b<char>;
template <typename b, typename a> b d(a e, unsigned) {
b f;
f[2] = 2;
memcpy(&f, e, 2);
return f;
}
void g(c);
char h;
long i;
void j() { g(d<c>(&h, i)); }
$ clang -cc1 -emit-obj -O2 SkOpts_sse41-c0ad66.reduced.cpp
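
(Sketch, for context; a guess from reading the reduction, not verified
against the crash.) After inlining, lanes 0 and 1 of the returned
<4 x i8> both come from one 2-byte source load, roughly:

  %w = load i16, i16* %p, align 1        ; the 2-byte memcpy source
  %lo = trunc i16 %w to i8               ; byte offset 0
  %s = lshr i16 %w, 8
  %hi = trunc i16 %s to i8               ; byte offset 1
  %v0 = insertelement <4 x i8> <i8 undef, i8 undef, i8 2, i8 undef>, i8 %lo, i32 0
  %v1 = insertelement <4 x i8> %v0, i8 %hi, i32 1

(%p and the value names are invented.) That is the common-source-load
shape the reverted patch started recognising.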
On Thu, Jul 18, 2019 at 2:29 PM Reid Kleckner <rnk at google.com> wrote:
> This caused some crashes building some vector code in skia:
> https://ci.chromium.org/p/chromium/builders/ci/ToTLinux/7145
> I reverted and started creduce, but I'm leaving for the day soon.
>
> On Thu, Jul 18, 2019 at 7:33 AM Simon Pilgrim via llvm-commits <
> llvm-commits at lists.llvm.org> wrote:
>
>> Author: rksimon
>> Date: Thu Jul 18 07:33:25 2019
>> New Revision: 366441
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=366441&view=rev
>> Log:
>> [X86] EltsFromConsecutiveLoads - support common source loads
>>
>> This patch finds the source load for each element, splitting it into
>> a Load and ByteOffset, and attempts to recognise consecutive loads
>> that are in fact from the same source load.
>>
>> A helper function, findEltLoadSrc, recurses to find a LoadSDNode and
>> determines the element's byte offset within it. When attempting to match
>> consecutive loads, byte-offsetted loads are then matched against a
>> previous load that has already been confirmed to be a consecutive match.
>>
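>> For illustration, a hand-written IR example of the kind of pattern
>> this now recognises (function and value names invented, not taken
>> from the test suite): a v4i16 build_vector whose lanes are all byte
>> offsets into one i64 load, replaceable by a single <4 x i16> load of
>> the same address.
>>
>>   define <4 x i16> @build_from_one_load(i64* %p) {
>>     %w = load i64, i64* %p, align 2
>>     %e0 = trunc i64 %w to i16
>>     %s1 = lshr i64 %w, 16
>>     %e1 = trunc i64 %s1 to i16
>>     %s2 = lshr i64 %w, 32
>>     %e2 = trunc i64 %s2 to i16
>>     %s3 = lshr i64 %w, 48
>>     %e3 = trunc i64 %s3 to i16
>>     %v0 = insertelement <4 x i16> undef, i16 %e0, i32 0
>>     %v1 = insertelement <4 x i16> %v0, i16 %e1, i32 1
>>     %v2 = insertelement <4 x i16> %v1, i16 %e2, i32 2
>>     %v3 = insertelement <4 x i16> %v2, i16 %e3, i32 3
>>     ret <4 x i16> %v3
>>   }
>>
>> findEltLoadSrc resolves every lane to the same LoadSDNode with
>> ByteOffsets 0, 2, 4 and 6, and lanes 1-3 then match because each
>> offset is a whole multiple of the 2-byte element size back to lane 0.
>>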
>> Next step towards PR16739 - after this we just need to account for
>> shuffling/repeated elements to create a vector load + shuffle.
>>
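>> For instance (a hypothetical case this patch does not yet handle),
>> lanes 0,1,1,0 of a single load:
>>
>>   %w = load <4 x i32>, <4 x i32>* %p
>>   %e0 = extractelement <4 x i32> %w, i32 0
>>   %e1 = extractelement <4 x i32> %w, i32 1
>>   %v0 = insertelement <4 x i32> undef, i32 %e0, i32 0
>>   %v1 = insertelement <4 x i32> %v0, i32 %e1, i32 1
>>   %v2 = insertelement <4 x i32> %v1, i32 %e1, i32 2
>>   %v3 = insertelement <4 x i32> %v2, i32 %e0, i32 3
>>
>> which wants a vector load plus a shuffle with mask <0,1,1,0> rather
>> than scalar extracts and inserts.
>>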
>> Differential Revision: https://reviews.llvm.org/D64551
>>
>> Modified:
>> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
>> llvm/trunk/test/CodeGen/X86/load-partial.ll
>>
>> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=366441&r1=366440&r2=366441&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jul 18 07:33:25 2019
>> @@ -7504,6 +7504,46 @@ static SDValue LowerAsSplatVectorLoad(SD
>> return SDValue();
>> }
>>
>> +// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
>> +static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
>> + if (ISD::isNON_EXTLoad(Elt.getNode())) {
>> + Ld = cast<LoadSDNode>(Elt);
>> + ByteOffset = 0;
>> + return true;
>> + }
>> +
>> + switch (Elt.getOpcode()) {
>> + case ISD::BITCAST:
>> + case ISD::TRUNCATE:
>> + case ISD::SCALAR_TO_VECTOR:
>> + return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
>> + case ISD::SRL:
>> + if (isa<ConstantSDNode>(Elt.getOperand(1))) {
>> + uint64_t Idx = Elt.getConstantOperandVal(1);
>> +      if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
>> + ByteOffset += Idx / 8;
>> + return true;
>> + }
>> + }
>> + break;
>> + case ISD::EXTRACT_VECTOR_ELT:
>> + if (isa<ConstantSDNode>(Elt.getOperand(1))) {
>> + SDValue Src = Elt.getOperand(0);
>> + unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
>> + unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
>> + if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
>> + findEltLoadSrc(Src, Ld, ByteOffset)) {
>> + uint64_t Idx = Elt.getConstantOperandVal(1);
>> + ByteOffset += Idx * (SrcSizeInBits / 8);
>> + return true;
>> + }
>> + }
>> + break;
>> + }
>> +
>> + return false;
>> +}
>> +
>> /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
>> /// elements can be replaced by a single large load which has the same value as
>> /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
>> @@ -7521,6 +7561,7 @@ static SDValue EltsFromConsecutiveLoads(
>> APInt UndefMask = APInt::getNullValue(NumElems);
>>
>> SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
>> + SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
>>
>> // For each element in the initializer, see if we've found a load,
>> zero or an
>> // undef.
>> @@ -7539,13 +7580,17 @@ static SDValue EltsFromConsecutiveLoads(
>>
>> // Each loaded element must be the correct fractional portion of the
>> // requested vector load.
>> - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
>> + unsigned EltSizeInBits = Elt.getValueSizeInBits();
>> + if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
>> return SDValue();
>>
>> - if (!ISD::isNON_EXTLoad(Elt.getNode()))
>> + if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]))
>> return SDValue();
>> + assert(0 <= ByteOffsets[i] &&
>> + ((ByteOffsets[i] * 8) + EltSizeInBits) <=
>> + Loads[i]->getValueSizeInBits(0) &&
>> + "Element offset outside of load bounds");
>>
>> - Loads[i] = cast<LoadSDNode>(Elt);
>> LoadMask.setBit(i);
>> LastLoadedElt = i;
>> }
>> @@ -7575,6 +7620,20 @@ static SDValue EltsFromConsecutiveLoads(
>>   int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
>> assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
>>
>> + // Check to see if the element's load is consecutive to the base load
>> + // or offset from a previous (already checked) load.
>> + auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
>> + LoadSDNode *Ld = Loads[EltIdx];
>> + int64_t ByteOffset = ByteOffsets[EltIdx];
>> + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
>> + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
>> +    return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
>> + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
>> + }
>> +    return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
>> +                                              EltIdx - FirstLoadedElt);
>> + };
>> +
>>   // Consecutive loads can contain UNDEFs but not ZERO elements.
>>   // Consecutive loads with UNDEF and ZERO elements require an
>>   // additional shuffle stage to clear the ZERO elements.
>> @@ -7582,8 +7641,7 @@ static SDValue EltsFromConsecutiveLoads(
>> bool IsConsecutiveLoadWithZeros = true;
>> for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
>> if (LoadMask[i]) {
>> -      if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
>> -                                              i - FirstLoadedElt)) {
>> + if (!CheckConsecutiveLoad(LDBase, i)) {
>> IsConsecutiveLoad = false;
>> IsConsecutiveLoadWithZeros = false;
>> break;
>>
>> Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=366441&r1=366440&r2=366441&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Thu Jul 18 07:33:25 2019
>> @@ -985,99 +985,54 @@ define <32 x i8> @_clearupper32xi8b(<32
>> ; AVX1-LABEL: _clearupper32xi8b:
>> ; AVX1: # %bb.0:
>> ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
>> -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9
>> -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
>> -; AVX1-NEXT: movq %r9, %r8
>> -; AVX1-NEXT: shrq $56, %r8
>> -; AVX1-NEXT: andl $15, %r8d
>> -; AVX1-NEXT: movq %rcx, %rsi
>> -; AVX1-NEXT: movq %rcx, %rdi
>> -; AVX1-NEXT: movq %rcx, %rdx
>> -; AVX1-NEXT: movq %rcx, %rax
>> -; AVX1-NEXT: shrq $32, %rax
>> -; AVX1-NEXT: andl $15, %eax
>> -; AVX1-NEXT: shlq $32, %rax
>> -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
>> -; AVX1-NEXT: orq %rax, %rcx
>> -; AVX1-NEXT: movq %r9, %rax
>> -; AVX1-NEXT: shrq $48, %rax
>> -; AVX1-NEXT: andl $15, %eax
>> -; AVX1-NEXT: shrq $40, %rdx
>> -; AVX1-NEXT: andl $15, %edx
>> -; AVX1-NEXT: shlq $40, %rdx
>> -; AVX1-NEXT: orq %rcx, %rdx
>> -; AVX1-NEXT: movq %r9, %rcx
>> -; AVX1-NEXT: shrq $40, %rcx
>> -; AVX1-NEXT: andl $15, %ecx
>> -; AVX1-NEXT: shrq $48, %rdi
>> -; AVX1-NEXT: andl $15, %edi
>> -; AVX1-NEXT: shlq $48, %rdi
>> -; AVX1-NEXT: orq %rdx, %rdi
>> -; AVX1-NEXT: movq %r9, %rdx
>> -; AVX1-NEXT: shrq $32, %rdx
>> -; AVX1-NEXT: andl $15, %edx
>> -; AVX1-NEXT: shrq $56, %rsi
>> -; AVX1-NEXT: andl $15, %esi
>> -; AVX1-NEXT: shlq $56, %rsi
>> -; AVX1-NEXT: orq %rdi, %rsi
>> -; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
>> -; AVX1-NEXT: shlq $32, %rdx
>> -; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
>> -; AVX1-NEXT: orq %rdx, %r9
>> -; AVX1-NEXT: shlq $40, %rcx
>> -; AVX1-NEXT: orq %r9, %rcx
>> -; AVX1-NEXT: shlq $48, %rax
>> -; AVX1-NEXT: orq %rcx, %rax
>> -; AVX1-NEXT: shlq $56, %r8
>> -; AVX1-NEXT: orq %rax, %r8
>> -; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
>> -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
>> -; AVX1-NEXT: vmovq %xmm0, %rax
>> -; AVX1-NEXT: movq %rax, %r8
>> -; AVX1-NEXT: movq %rax, %r9
>> +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
>> +; AVX1-NEXT: movq %rax, %rcx
>> +; AVX1-NEXT: movq %rax, %rdx
>> ; AVX1-NEXT: movq %rax, %rsi
>> ; AVX1-NEXT: movq %rax, %rdi
>> -; AVX1-NEXT: movl %eax, %ecx
>> -; AVX1-NEXT: movl %eax, %edx
>> -; AVX1-NEXT: vmovd %eax, %xmm1
>> -; AVX1-NEXT: shrl $8, %eax
>> -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
>> -; AVX1-NEXT: shrl $16, %edx
>> -; AVX1-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
>> -; AVX1-NEXT: shrl $24, %ecx
>> -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
>> ; AVX1-NEXT: shrq $32, %rdi
>> -; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
>> +; AVX1-NEXT: andl $15, %edi
>> +; AVX1-NEXT: shlq $32, %rdi
>> +; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
>> +; AVX1-NEXT: orq %rdi, %rax
>> +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
>> ; AVX1-NEXT: shrq $40, %rsi
>> -; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
>> -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
>> -; AVX1-NEXT: shrq $48, %r9
>> -; AVX1-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
>> -; AVX1-NEXT: vpextrq $1, %xmm0, %rax
>> -; AVX1-NEXT: shrq $56, %r8
>> -; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
>> -; AVX1-NEXT: movl %eax, %ecx
>> -; AVX1-NEXT: shrl $8, %ecx
>> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
>> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: movl %eax, %ecx
>> -; AVX1-NEXT: shrl $16, %ecx
>> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: movl %eax, %ecx
>> -; AVX1-NEXT: shrl $24, %ecx
>> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: movq %rax, %rcx
>> +; AVX1-NEXT: andl $15, %esi
>> +; AVX1-NEXT: shlq $40, %rsi
>> +; AVX1-NEXT: orq %rax, %rsi
>> +; AVX1-NEXT: movq %rdi, %rax
>> +; AVX1-NEXT: shrq $48, %rdx
>> +; AVX1-NEXT: andl $15, %edx
>> +; AVX1-NEXT: shlq $48, %rdx
>> +; AVX1-NEXT: orq %rsi, %rdx
>> +; AVX1-NEXT: movq %rdi, %rsi
>> +; AVX1-NEXT: shrq $56, %rcx
>> +; AVX1-NEXT: andl $15, %ecx
>> +; AVX1-NEXT: shlq $56, %rcx
>> +; AVX1-NEXT: orq %rdx, %rcx
>> +; AVX1-NEXT: movq %rdi, %rdx
>> +; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
>> +; AVX1-NEXT: movq %rdi, %rcx
>> ; AVX1-NEXT: shrq $32, %rcx
>> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: movq %rax, %rcx
>> -; AVX1-NEXT: shrq $40, %rcx
>> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: movq %rax, %rcx
>> -; AVX1-NEXT: shrq $48, %rcx
>> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
>> -; AVX1-NEXT: vmovq %xmm2, %rcx
>> +; AVX1-NEXT: andl $15, %ecx
>> +; AVX1-NEXT: shlq $32, %rcx
>> +; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
>> +; AVX1-NEXT: orq %rcx, %rdi
>> +; AVX1-NEXT: shrq $40, %rdx
>> +; AVX1-NEXT: andl $15, %edx
>> +; AVX1-NEXT: shlq $40, %rdx
>> +; AVX1-NEXT: orq %rdi, %rdx
>> +; AVX1-NEXT: shrq $48, %rsi
>> +; AVX1-NEXT: andl $15, %esi
>> +; AVX1-NEXT: shlq $48, %rsi
>> +; AVX1-NEXT: orq %rdx, %rsi
>> ; AVX1-NEXT: shrq $56, %rax
>> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
>> +; AVX1-NEXT: andl $15, %eax
>> +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
>> +; AVX1-NEXT: shlq $56, %rax
>> +; AVX1-NEXT: orq %rsi, %rax
>> +; AVX1-NEXT: vmovq %xmm0, %rcx
>> +; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
>> ; AVX1-NEXT: movl %ecx, %eax
>> ; AVX1-NEXT: shrl $8, %eax
>> ; AVX1-NEXT: vmovd %ecx, %xmm1
>> @@ -1097,129 +1052,85 @@ define <32 x i8> @_clearupper32xi8b(<32
>> ; AVX1-NEXT: movq %rcx, %rax
>> ; AVX1-NEXT: shrq $48, %rax
>> ; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
>> -; AVX1-NEXT: vpextrq $1, %xmm2, %rax
>> +; AVX1-NEXT: vpextrq $1, %xmm0, %rax
>> ; AVX1-NEXT: shrq $56, %rcx
>> -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
>> ; AVX1-NEXT: movl %eax, %ecx
>> ; AVX1-NEXT: shrl $8, %ecx
>> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
>> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
>> +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: movl %eax, %ecx
>> ; AVX1-NEXT: shrl $16, %ecx
>> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: movl %eax, %ecx
>> ; AVX1-NEXT: shrl $24, %ecx
>> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: movq %rax, %rcx
>> ; AVX1-NEXT: shrq $32, %rcx
>> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: movq %rax, %rcx
>> ; AVX1-NEXT: shrq $40, %rcx
>> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: movq %rax, %rcx
>> ; AVX1-NEXT: shrq $48, %rcx
>> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
>> ; AVX1-NEXT: shrq $56, %rax
>> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
>> +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
>> +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1
>> ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
>> ; AVX1-NEXT: retq
>> ;
>> ; AVX2-LABEL: _clearupper32xi8b:
>> ; AVX2: # %bb.0:
>> ; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
>> -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9
>> -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
>> -; AVX2-NEXT: movq %r9, %r8
>> -; AVX2-NEXT: shrq $56, %r8
>> -; AVX2-NEXT: andl $15, %r8d
>> -; AVX2-NEXT: movq %rcx, %rsi
>> -; AVX2-NEXT: movq %rcx, %rdi
>> -; AVX2-NEXT: movq %rcx, %rdx
>> -; AVX2-NEXT: movq %rcx, %rax
>> -; AVX2-NEXT: shrq $32, %rax
>> -; AVX2-NEXT: andl $15, %eax
>> -; AVX2-NEXT: shlq $32, %rax
>> -; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
>> -; AVX2-NEXT: orq %rax, %rcx
>> -; AVX2-NEXT: movq %r9, %rax
>> -; AVX2-NEXT: shrq $48, %rax
>> -; AVX2-NEXT: andl $15, %eax
>> -; AVX2-NEXT: shrq $40, %rdx
>> -; AVX2-NEXT: andl $15, %edx
>> -; AVX2-NEXT: shlq $40, %rdx
>> -; AVX2-NEXT: orq %rcx, %rdx
>> -; AVX2-NEXT: movq %r9, %rcx
>> -; AVX2-NEXT: shrq $40, %rcx
>> -; AVX2-NEXT: andl $15, %ecx
>> -; AVX2-NEXT: shrq $48, %rdi
>> -; AVX2-NEXT: andl $15, %edi
>> -; AVX2-NEXT: shlq $48, %rdi
>> -; AVX2-NEXT: orq %rdx, %rdi
>> -; AVX2-NEXT: movq %r9, %rdx
>> -; AVX2-NEXT: shrq $32, %rdx
>> -; AVX2-NEXT: andl $15, %edx
>> -; AVX2-NEXT: shrq $56, %rsi
>> -; AVX2-NEXT: andl $15, %esi
>> -; AVX2-NEXT: shlq $56, %rsi
>> -; AVX2-NEXT: orq %rdi, %rsi
>> -; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
>> -; AVX2-NEXT: shlq $32, %rdx
>> -; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
>> -; AVX2-NEXT: orq %rdx, %r9
>> -; AVX2-NEXT: shlq $40, %rcx
>> -; AVX2-NEXT: orq %r9, %rcx
>> -; AVX2-NEXT: shlq $48, %rax
>> -; AVX2-NEXT: orq %rcx, %rax
>> -; AVX2-NEXT: shlq $56, %r8
>> -; AVX2-NEXT: orq %rax, %r8
>> -; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
>> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
>> -; AVX2-NEXT: vmovq %xmm0, %rax
>> -; AVX2-NEXT: movq %rax, %r8
>> -; AVX2-NEXT: movq %rax, %r9
>> +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
>> +; AVX2-NEXT: movq %rax, %rcx
>> +; AVX2-NEXT: movq %rax, %rdx
>> ; AVX2-NEXT: movq %rax, %rsi
>> ; AVX2-NEXT: movq %rax, %rdi
>> -; AVX2-NEXT: movl %eax, %ecx
>> -; AVX2-NEXT: movl %eax, %edx
>> -; AVX2-NEXT: vmovd %eax, %xmm1
>> -; AVX2-NEXT: shrl $8, %eax
>> -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
>> -; AVX2-NEXT: shrl $16, %edx
>> -; AVX2-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
>> -; AVX2-NEXT: shrl $24, %ecx
>> -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
>> ; AVX2-NEXT: shrq $32, %rdi
>> -; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
>> +; AVX2-NEXT: andl $15, %edi
>> +; AVX2-NEXT: shlq $32, %rdi
>> +; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
>> +; AVX2-NEXT: orq %rdi, %rax
>> +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
>> ; AVX2-NEXT: shrq $40, %rsi
>> -; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
>> -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
>> -; AVX2-NEXT: shrq $48, %r9
>> -; AVX2-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
>> -; AVX2-NEXT: vpextrq $1, %xmm0, %rax
>> -; AVX2-NEXT: shrq $56, %r8
>> -; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
>> -; AVX2-NEXT: movl %eax, %ecx
>> -; AVX2-NEXT: shrl $8, %ecx
>> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
>> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: movl %eax, %ecx
>> -; AVX2-NEXT: shrl $16, %ecx
>> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: movl %eax, %ecx
>> -; AVX2-NEXT: shrl $24, %ecx
>> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: movq %rax, %rcx
>> +; AVX2-NEXT: andl $15, %esi
>> +; AVX2-NEXT: shlq $40, %rsi
>> +; AVX2-NEXT: orq %rax, %rsi
>> +; AVX2-NEXT: movq %rdi, %rax
>> +; AVX2-NEXT: shrq $48, %rdx
>> +; AVX2-NEXT: andl $15, %edx
>> +; AVX2-NEXT: shlq $48, %rdx
>> +; AVX2-NEXT: orq %rsi, %rdx
>> +; AVX2-NEXT: movq %rdi, %rsi
>> +; AVX2-NEXT: shrq $56, %rcx
>> +; AVX2-NEXT: andl $15, %ecx
>> +; AVX2-NEXT: shlq $56, %rcx
>> +; AVX2-NEXT: orq %rdx, %rcx
>> +; AVX2-NEXT: movq %rdi, %rdx
>> +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
>> +; AVX2-NEXT: movq %rdi, %rcx
>> ; AVX2-NEXT: shrq $32, %rcx
>> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: movq %rax, %rcx
>> -; AVX2-NEXT: shrq $40, %rcx
>> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: movq %rax, %rcx
>> -; AVX2-NEXT: shrq $48, %rcx
>> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
>> -; AVX2-NEXT: vmovq %xmm2, %rcx
>> +; AVX2-NEXT: andl $15, %ecx
>> +; AVX2-NEXT: shlq $32, %rcx
>> +; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
>> +; AVX2-NEXT: orq %rcx, %rdi
>> +; AVX2-NEXT: shrq $40, %rdx
>> +; AVX2-NEXT: andl $15, %edx
>> +; AVX2-NEXT: shlq $40, %rdx
>> +; AVX2-NEXT: orq %rdi, %rdx
>> +; AVX2-NEXT: shrq $48, %rsi
>> +; AVX2-NEXT: andl $15, %esi
>> +; AVX2-NEXT: shlq $48, %rsi
>> +; AVX2-NEXT: orq %rdx, %rsi
>> ; AVX2-NEXT: shrq $56, %rax
>> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
>> +; AVX2-NEXT: andl $15, %eax
>> +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
>> +; AVX2-NEXT: shlq $56, %rax
>> +; AVX2-NEXT: orq %rsi, %rax
>> +; AVX2-NEXT: vmovq %xmm0, %rcx
>> +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
>> ; AVX2-NEXT: movl %ecx, %eax
>> ; AVX2-NEXT: shrl $8, %eax
>> ; AVX2-NEXT: vmovd %ecx, %xmm1
>> @@ -1239,30 +1150,31 @@ define <32 x i8> @_clearupper32xi8b(<32
>> ; AVX2-NEXT: movq %rcx, %rax
>> ; AVX2-NEXT: shrq $48, %rax
>> ; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
>> -; AVX2-NEXT: vpextrq $1, %xmm2, %rax
>> +; AVX2-NEXT: vpextrq $1, %xmm0, %rax
>> ; AVX2-NEXT: shrq $56, %rcx
>> -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
>> ; AVX2-NEXT: movl %eax, %ecx
>> ; AVX2-NEXT: shrl $8, %ecx
>> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
>> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
>> +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: movl %eax, %ecx
>> ; AVX2-NEXT: shrl $16, %ecx
>> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: movl %eax, %ecx
>> ; AVX2-NEXT: shrl $24, %ecx
>> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: movq %rax, %rcx
>> ; AVX2-NEXT: shrq $32, %rcx
>> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: movq %rax, %rcx
>> ; AVX2-NEXT: shrq $40, %rcx
>> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: movq %rax, %rcx
>> ; AVX2-NEXT: shrq $48, %rcx
>> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
>> ; AVX2-NEXT: shrq $56, %rax
>> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
>> +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
>> +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
>> ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
>> ; AVX2-NEXT: retq
>> %x4 = bitcast <32 x i8> %0 to <64 x i4>
>>
>> Modified: llvm/trunk/test/CodeGen/X86/load-partial.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-partial.ll?rev=366441&r1=366440&r2=366441&view=diff
>>
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/load-partial.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/load-partial.ll Thu Jul 18 07:33:25 2019
>> @@ -54,32 +54,14 @@ define <8 x float> @load_float8_float3(<
>> }
>>
define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) {
>> -; SSE2-LABEL: load_float4_float3_as_float2_float:
>> -; SSE2: # %bb.0:
>> -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
>> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
>> -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
>> -; SSE2-NEXT: retq
>> -;
>> -; SSSE3-LABEL: load_float4_float3_as_float2_float:
>> -; SSSE3: # %bb.0:
>> -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
>> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
>> -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
>> -; SSSE3-NEXT: retq
>> -;
>> -; SSE41-LABEL: load_float4_float3_as_float2_float:
>> -; SSE41: # %bb.0:
>> -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
>> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
>> -; SSE41-NEXT: retq
>> +; SSE-LABEL: load_float4_float3_as_float2_float:
>> +; SSE: # %bb.0:
>> +; SSE-NEXT: movups (%rdi), %xmm0
>> +; SSE-NEXT: retq
>> ;
>> ; AVX-LABEL: load_float4_float3_as_float2_float:
>> ; AVX: # %bb.0:
>> -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
>> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
>> +; AVX-NEXT: vmovups (%rdi), %xmm0
>> ; AVX-NEXT: retq
>> %2 = bitcast <4 x float>* %0 to <2 x float>*
>> %3 = load <2 x float>, <2 x float>* %2, align 4
>> @@ -94,36 +76,14 @@ define <4 x float> @load_float4_float3_a
>> }
>>
define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {
>> -; SSE2-LABEL: load_float4_float3_trunc:
>> -; SSE2: # %bb.0:
>> -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
>> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
>> -; SSE2-NEXT: retq
>> -;
>> -; SSSE3-LABEL: load_float4_float3_trunc:
>> -; SSSE3: # %bb.0:
>> -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
>> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
>> -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
>> -; SSSE3-NEXT: retq
>> -;
>> -; SSE41-LABEL: load_float4_float3_trunc:
>> -; SSE41: # %bb.0:
>> -; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
>> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
>> -; SSE41-NEXT: retq
>> +; SSE-LABEL: load_float4_float3_trunc:
>> +; SSE: # %bb.0:
>> +; SSE-NEXT: movaps (%rdi), %xmm0
>> +; SSE-NEXT: retq
>> ;
>> ; AVX-LABEL: load_float4_float3_trunc:
>> ; AVX: # %bb.0:
>> -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
>> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
>> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
>> +; AVX-NEXT: vmovaps (%rdi), %xmm0
>> ; AVX-NEXT: retq
>> %2 = bitcast <4 x float>* %0 to i64*
>> %3 = load i64, i64* %2, align 16
>>
>>