[llvm] r366441 - [X86] EltsFromConsecutiveLoads - support common source loads
Reid Kleckner via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 18 14:29:14 PDT 2019
This caused some crashes when building some vector code in Skia:
https://ci.chromium.org/p/chromium/builders/ci/ToTLinux/7145
I reverted and started creduce, but I'm leaving for the day soon.
On Thu, Jul 18, 2019 at 7:33 AM Simon Pilgrim via llvm-commits <
llvm-commits at lists.llvm.org> wrote:
> Author: rksimon
> Date: Thu Jul 18 07:33:25 2019
> New Revision: 366441
>
> URL: http://llvm.org/viewvc/llvm-project?rev=366441&view=rev
> Log:
> [X86] EltsFromConsecutiveLoads - support common source loads
>
> This patch enables us to find the source loads for each element, splitting
> them into a Load and ByteOffset, and attempts to recognise consecutive
> loads that are in fact from the same source load.
>
> A helper function, findEltLoadSrc, recurses to find a LoadSDNode and
> determines the element's byte offset within it. When attempting to match
> consecutive loads, byte offsetted loads then attempt to be matched against a
> previous load that has already been confirmed to be a consecutive match.
>
> Next step towards PR16739 - after this we just need to account for
> shuffling/repeated elements to create a vector load + shuffle.
>
> Differential Revision: https://reviews.llvm.org/D64551
>
> Modified:
> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
> llvm/trunk/test/CodeGen/X86/load-partial.ll
>
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=366441&r1=366440&r2=366441&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jul 18 07:33:25 2019
> @@ -7504,6 +7504,46 @@ static SDValue LowerAsSplatVectorLoad(SD
> return SDValue();
> }
>
> +// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
> +static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t
> &ByteOffset) {
> + if (ISD::isNON_EXTLoad(Elt.getNode())) {
> + Ld = cast<LoadSDNode>(Elt);
> + ByteOffset = 0;
> + return true;
> + }
> +
> + switch (Elt.getOpcode()) {
> + case ISD::BITCAST:
> + case ISD::TRUNCATE:
> + case ISD::SCALAR_TO_VECTOR:
> + return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
> + case ISD::SRL:
> + if (isa<ConstantSDNode>(Elt.getOperand(1))) {
> + uint64_t Idx = Elt.getConstantOperandVal(1);
> + if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld,
> ByteOffset)) {
> + ByteOffset += Idx / 8;
> + return true;
> + }
> + }
> + break;
> + case ISD::EXTRACT_VECTOR_ELT:
> + if (isa<ConstantSDNode>(Elt.getOperand(1))) {
> + SDValue Src = Elt.getOperand(0);
> + unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
> + unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
> + if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
> + findEltLoadSrc(Src, Ld, ByteOffset)) {
> + uint64_t Idx = Elt.getConstantOperandVal(1);
> + ByteOffset += Idx * (SrcSizeInBits / 8);
> + return true;
> + }
> + }
> + break;
> + }
> +
> + return false;
> +}
> +
> /// Given the initializing elements 'Elts' of a vector of type 'VT', see
> if the
> /// elements can be replaced by a single large load which has the same
> value as
> /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
> @@ -7521,6 +7561,7 @@ static SDValue EltsFromConsecutiveLoads(
> APInt UndefMask = APInt::getNullValue(NumElems);
>
> SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
> + SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
>
> // For each element in the initializer, see if we've found a load, zero
> or an
> // undef.
> @@ -7539,13 +7580,17 @@ static SDValue EltsFromConsecutiveLoads(
>
> // Each loaded element must be the correct fractional portion of the
> // requested vector load.
> - if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
> + unsigned EltSizeInBits = Elt.getValueSizeInBits();
> + if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
> return SDValue();
>
> - if (!ISD::isNON_EXTLoad(Elt.getNode()))
> + if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]))
> return SDValue();
> + assert(0 <= ByteOffsets[i] &&
> + ((ByteOffsets[i] * 8) + EltSizeInBits) <=
> + Loads[i]->getValueSizeInBits(0) &&
> + "Element offset outside of load bounds");
>
> - Loads[i] = cast<LoadSDNode>(Elt);
> LoadMask.setBit(i);
> LastLoadedElt = i;
> }
> @@ -7575,6 +7620,20 @@ static SDValue EltsFromConsecutiveLoads(
> int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) *
> BaseSizeInBits;
> assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
>
> + // Check to see if the element's load is consecutive to the base load
> + // or offset from a previous (already checked) load.
> + auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
> + LoadSDNode *Ld = Loads[EltIdx];
> + int64_t ByteOffset = ByteOffsets[EltIdx];
> + if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
> + int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
> + return (0 <= BaseIdx && BaseIdx < (int)NumElems &&
> LoadMask[BaseIdx] &&
> + Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
> + }
> + return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
> + EltIdx - FirstLoadedElt);
> + };
> +
> // Consecutive loads can contain UNDEFS but not ZERO elements.
> // Consecutive loads with UNDEFs and ZEROs elements require
> // an additional shuffle stage to clear the ZERO elements.
> @@ -7582,8 +7641,7 @@ static SDValue EltsFromConsecutiveLoads(
> bool IsConsecutiveLoadWithZeros = true;
> for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
> if (LoadMask[i]) {
> - if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase,
> BaseSizeInBytes,
> - i - FirstLoadedElt)) {
> + if (!CheckConsecutiveLoad(LDBase, i)) {
> IsConsecutiveLoad = false;
> IsConsecutiveLoadWithZeros = false;
> break;
>
> Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=366441&r1=366440&r2=366441&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
> (original)
> +++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Thu Jul
> 18 07:33:25 2019
> @@ -985,99 +985,54 @@ define <32 x i8> @_clearupper32xi8b(<32
> ; AVX1-LABEL: _clearupper32xi8b:
> ; AVX1: # %bb.0:
> ; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
> -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9
> -; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
> -; AVX1-NEXT: movq %r9, %r8
> -; AVX1-NEXT: shrq $56, %r8
> -; AVX1-NEXT: andl $15, %r8d
> -; AVX1-NEXT: movq %rcx, %rsi
> -; AVX1-NEXT: movq %rcx, %rdi
> -; AVX1-NEXT: movq %rcx, %rdx
> -; AVX1-NEXT: movq %rcx, %rax
> -; AVX1-NEXT: shrq $32, %rax
> -; AVX1-NEXT: andl $15, %eax
> -; AVX1-NEXT: shlq $32, %rax
> -; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
> -; AVX1-NEXT: orq %rax, %rcx
> -; AVX1-NEXT: movq %r9, %rax
> -; AVX1-NEXT: shrq $48, %rax
> -; AVX1-NEXT: andl $15, %eax
> -; AVX1-NEXT: shrq $40, %rdx
> -; AVX1-NEXT: andl $15, %edx
> -; AVX1-NEXT: shlq $40, %rdx
> -; AVX1-NEXT: orq %rcx, %rdx
> -; AVX1-NEXT: movq %r9, %rcx
> -; AVX1-NEXT: shrq $40, %rcx
> -; AVX1-NEXT: andl $15, %ecx
> -; AVX1-NEXT: shrq $48, %rdi
> -; AVX1-NEXT: andl $15, %edi
> -; AVX1-NEXT: shlq $48, %rdi
> -; AVX1-NEXT: orq %rdx, %rdi
> -; AVX1-NEXT: movq %r9, %rdx
> -; AVX1-NEXT: shrq $32, %rdx
> -; AVX1-NEXT: andl $15, %edx
> -; AVX1-NEXT: shrq $56, %rsi
> -; AVX1-NEXT: andl $15, %esi
> -; AVX1-NEXT: shlq $56, %rsi
> -; AVX1-NEXT: orq %rdi, %rsi
> -; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
> -; AVX1-NEXT: shlq $32, %rdx
> -; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
> -; AVX1-NEXT: orq %rdx, %r9
> -; AVX1-NEXT: shlq $40, %rcx
> -; AVX1-NEXT: orq %r9, %rcx
> -; AVX1-NEXT: shlq $48, %rax
> -; AVX1-NEXT: orq %rcx, %rax
> -; AVX1-NEXT: shlq $56, %r8
> -; AVX1-NEXT: orq %rax, %r8
> -; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
> -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
> -; AVX1-NEXT: vmovq %xmm0, %rax
> -; AVX1-NEXT: movq %rax, %r8
> -; AVX1-NEXT: movq %rax, %r9
> +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
> +; AVX1-NEXT: movq %rax, %rcx
> +; AVX1-NEXT: movq %rax, %rdx
> ; AVX1-NEXT: movq %rax, %rsi
> ; AVX1-NEXT: movq %rax, %rdi
> -; AVX1-NEXT: movl %eax, %ecx
> -; AVX1-NEXT: movl %eax, %edx
> -; AVX1-NEXT: vmovd %eax, %xmm1
> -; AVX1-NEXT: shrl $8, %eax
> -; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
> -; AVX1-NEXT: shrl $16, %edx
> -; AVX1-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
> -; AVX1-NEXT: shrl $24, %ecx
> -; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
> ; AVX1-NEXT: shrq $32, %rdi
> -; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
> +; AVX1-NEXT: andl $15, %edi
> +; AVX1-NEXT: shlq $32, %rdi
> +; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
> +; AVX1-NEXT: orq %rdi, %rax
> +; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
> ; AVX1-NEXT: shrq $40, %rsi
> -; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
> -; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
> -; AVX1-NEXT: shrq $48, %r9
> -; AVX1-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
> -; AVX1-NEXT: vpextrq $1, %xmm0, %rax
> -; AVX1-NEXT: shrq $56, %r8
> -; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
> -; AVX1-NEXT: movl %eax, %ecx
> -; AVX1-NEXT: shrl $8, %ecx
> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: movl %eax, %ecx
> -; AVX1-NEXT: shrl $16, %ecx
> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: movl %eax, %ecx
> -; AVX1-NEXT: shrl $24, %ecx
> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: movq %rax, %rcx
> +; AVX1-NEXT: andl $15, %esi
> +; AVX1-NEXT: shlq $40, %rsi
> +; AVX1-NEXT: orq %rax, %rsi
> +; AVX1-NEXT: movq %rdi, %rax
> +; AVX1-NEXT: shrq $48, %rdx
> +; AVX1-NEXT: andl $15, %edx
> +; AVX1-NEXT: shlq $48, %rdx
> +; AVX1-NEXT: orq %rsi, %rdx
> +; AVX1-NEXT: movq %rdi, %rsi
> +; AVX1-NEXT: shrq $56, %rcx
> +; AVX1-NEXT: andl $15, %ecx
> +; AVX1-NEXT: shlq $56, %rcx
> +; AVX1-NEXT: orq %rdx, %rcx
> +; AVX1-NEXT: movq %rdi, %rdx
> +; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
> +; AVX1-NEXT: movq %rdi, %rcx
> ; AVX1-NEXT: shrq $32, %rcx
> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: movq %rax, %rcx
> -; AVX1-NEXT: shrq $40, %rcx
> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: movq %rax, %rcx
> -; AVX1-NEXT: shrq $48, %rcx
> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
> -; AVX1-NEXT: vmovq %xmm2, %rcx
> +; AVX1-NEXT: andl $15, %ecx
> +; AVX1-NEXT: shlq $32, %rcx
> +; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
> +; AVX1-NEXT: orq %rcx, %rdi
> +; AVX1-NEXT: shrq $40, %rdx
> +; AVX1-NEXT: andl $15, %edx
> +; AVX1-NEXT: shlq $40, %rdx
> +; AVX1-NEXT: orq %rdi, %rdx
> +; AVX1-NEXT: shrq $48, %rsi
> +; AVX1-NEXT: andl $15, %esi
> +; AVX1-NEXT: shlq $48, %rsi
> +; AVX1-NEXT: orq %rdx, %rsi
> ; AVX1-NEXT: shrq $56, %rax
> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
> +; AVX1-NEXT: andl $15, %eax
> +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
> +; AVX1-NEXT: shlq $56, %rax
> +; AVX1-NEXT: orq %rsi, %rax
> +; AVX1-NEXT: vmovq %xmm0, %rcx
> +; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
> ; AVX1-NEXT: movl %ecx, %eax
> ; AVX1-NEXT: shrl $8, %eax
> ; AVX1-NEXT: vmovd %ecx, %xmm1
> @@ -1097,129 +1052,85 @@ define <32 x i8> @_clearupper32xi8b(<32
> ; AVX1-NEXT: movq %rcx, %rax
> ; AVX1-NEXT: shrq $48, %rax
> ; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
> -; AVX1-NEXT: vpextrq $1, %xmm2, %rax
> +; AVX1-NEXT: vpextrq $1, %xmm0, %rax
> ; AVX1-NEXT: shrq $56, %rcx
> -; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
> ; AVX1-NEXT: movl %eax, %ecx
> ; AVX1-NEXT: shrl $8, %ecx
> -; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
> -; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
> +; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: movl %eax, %ecx
> ; AVX1-NEXT: shrl $16, %ecx
> -; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: movl %eax, %ecx
> ; AVX1-NEXT: shrl $24, %ecx
> -; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: movq %rax, %rcx
> ; AVX1-NEXT: shrq $32, %rcx
> -; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: movq %rax, %rcx
> ; AVX1-NEXT: shrq $40, %rcx
> -; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: movq %rax, %rcx
> ; AVX1-NEXT: shrq $48, %rcx
> -; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
> ; AVX1-NEXT: shrq $56, %rax
> -; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
> +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
> +; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1
> ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
> ; AVX1-NEXT: retq
> ;
> ; AVX2-LABEL: _clearupper32xi8b:
> ; AVX2: # %bb.0:
> ; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
> -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9
> -; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
> -; AVX2-NEXT: movq %r9, %r8
> -; AVX2-NEXT: shrq $56, %r8
> -; AVX2-NEXT: andl $15, %r8d
> -; AVX2-NEXT: movq %rcx, %rsi
> -; AVX2-NEXT: movq %rcx, %rdi
> -; AVX2-NEXT: movq %rcx, %rdx
> -; AVX2-NEXT: movq %rcx, %rax
> -; AVX2-NEXT: shrq $32, %rax
> -; AVX2-NEXT: andl $15, %eax
> -; AVX2-NEXT: shlq $32, %rax
> -; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
> -; AVX2-NEXT: orq %rax, %rcx
> -; AVX2-NEXT: movq %r9, %rax
> -; AVX2-NEXT: shrq $48, %rax
> -; AVX2-NEXT: andl $15, %eax
> -; AVX2-NEXT: shrq $40, %rdx
> -; AVX2-NEXT: andl $15, %edx
> -; AVX2-NEXT: shlq $40, %rdx
> -; AVX2-NEXT: orq %rcx, %rdx
> -; AVX2-NEXT: movq %r9, %rcx
> -; AVX2-NEXT: shrq $40, %rcx
> -; AVX2-NEXT: andl $15, %ecx
> -; AVX2-NEXT: shrq $48, %rdi
> -; AVX2-NEXT: andl $15, %edi
> -; AVX2-NEXT: shlq $48, %rdi
> -; AVX2-NEXT: orq %rdx, %rdi
> -; AVX2-NEXT: movq %r9, %rdx
> -; AVX2-NEXT: shrq $32, %rdx
> -; AVX2-NEXT: andl $15, %edx
> -; AVX2-NEXT: shrq $56, %rsi
> -; AVX2-NEXT: andl $15, %esi
> -; AVX2-NEXT: shlq $56, %rsi
> -; AVX2-NEXT: orq %rdi, %rsi
> -; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
> -; AVX2-NEXT: shlq $32, %rdx
> -; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
> -; AVX2-NEXT: orq %rdx, %r9
> -; AVX2-NEXT: shlq $40, %rcx
> -; AVX2-NEXT: orq %r9, %rcx
> -; AVX2-NEXT: shlq $48, %rax
> -; AVX2-NEXT: orq %rcx, %rax
> -; AVX2-NEXT: shlq $56, %r8
> -; AVX2-NEXT: orq %rax, %r8
> -; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
> -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
> -; AVX2-NEXT: vmovq %xmm0, %rax
> -; AVX2-NEXT: movq %rax, %r8
> -; AVX2-NEXT: movq %rax, %r9
> +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
> +; AVX2-NEXT: movq %rax, %rcx
> +; AVX2-NEXT: movq %rax, %rdx
> ; AVX2-NEXT: movq %rax, %rsi
> ; AVX2-NEXT: movq %rax, %rdi
> -; AVX2-NEXT: movl %eax, %ecx
> -; AVX2-NEXT: movl %eax, %edx
> -; AVX2-NEXT: vmovd %eax, %xmm1
> -; AVX2-NEXT: shrl $8, %eax
> -; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
> -; AVX2-NEXT: shrl $16, %edx
> -; AVX2-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
> -; AVX2-NEXT: shrl $24, %ecx
> -; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
> ; AVX2-NEXT: shrq $32, %rdi
> -; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
> +; AVX2-NEXT: andl $15, %edi
> +; AVX2-NEXT: shlq $32, %rdi
> +; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
> +; AVX2-NEXT: orq %rdi, %rax
> +; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
> ; AVX2-NEXT: shrq $40, %rsi
> -; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
> -; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
> -; AVX2-NEXT: shrq $48, %r9
> -; AVX2-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm0, %rax
> -; AVX2-NEXT: shrq $56, %r8
> -; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
> -; AVX2-NEXT: movl %eax, %ecx
> -; AVX2-NEXT: shrl $8, %ecx
> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: movl %eax, %ecx
> -; AVX2-NEXT: shrl $16, %ecx
> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: movl %eax, %ecx
> -; AVX2-NEXT: shrl $24, %ecx
> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: movq %rax, %rcx
> +; AVX2-NEXT: andl $15, %esi
> +; AVX2-NEXT: shlq $40, %rsi
> +; AVX2-NEXT: orq %rax, %rsi
> +; AVX2-NEXT: movq %rdi, %rax
> +; AVX2-NEXT: shrq $48, %rdx
> +; AVX2-NEXT: andl $15, %edx
> +; AVX2-NEXT: shlq $48, %rdx
> +; AVX2-NEXT: orq %rsi, %rdx
> +; AVX2-NEXT: movq %rdi, %rsi
> +; AVX2-NEXT: shrq $56, %rcx
> +; AVX2-NEXT: andl $15, %ecx
> +; AVX2-NEXT: shlq $56, %rcx
> +; AVX2-NEXT: orq %rdx, %rcx
> +; AVX2-NEXT: movq %rdi, %rdx
> +; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
> +; AVX2-NEXT: movq %rdi, %rcx
> ; AVX2-NEXT: shrq $32, %rcx
> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: movq %rax, %rcx
> -; AVX2-NEXT: shrq $40, %rcx
> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: movq %rax, %rcx
> -; AVX2-NEXT: shrq $48, %rcx
> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
> -; AVX2-NEXT: vmovq %xmm2, %rcx
> +; AVX2-NEXT: andl $15, %ecx
> +; AVX2-NEXT: shlq $32, %rcx
> +; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
> +; AVX2-NEXT: orq %rcx, %rdi
> +; AVX2-NEXT: shrq $40, %rdx
> +; AVX2-NEXT: andl $15, %edx
> +; AVX2-NEXT: shlq $40, %rdx
> +; AVX2-NEXT: orq %rdi, %rdx
> +; AVX2-NEXT: shrq $48, %rsi
> +; AVX2-NEXT: andl $15, %esi
> +; AVX2-NEXT: shlq $48, %rsi
> +; AVX2-NEXT: orq %rdx, %rsi
> ; AVX2-NEXT: shrq $56, %rax
> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
> +; AVX2-NEXT: andl $15, %eax
> +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
> +; AVX2-NEXT: shlq $56, %rax
> +; AVX2-NEXT: orq %rsi, %rax
> +; AVX2-NEXT: vmovq %xmm0, %rcx
> +; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
> ; AVX2-NEXT: movl %ecx, %eax
> ; AVX2-NEXT: shrl $8, %eax
> ; AVX2-NEXT: vmovd %ecx, %xmm1
> @@ -1239,30 +1150,31 @@ define <32 x i8> @_clearupper32xi8b(<32
> ; AVX2-NEXT: movq %rcx, %rax
> ; AVX2-NEXT: shrq $48, %rax
> ; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
> -; AVX2-NEXT: vpextrq $1, %xmm2, %rax
> +; AVX2-NEXT: vpextrq $1, %xmm0, %rax
> ; AVX2-NEXT: shrq $56, %rcx
> -; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
> ; AVX2-NEXT: movl %eax, %ecx
> ; AVX2-NEXT: shrl $8, %ecx
> -; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
> -; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
> +; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: movl %eax, %ecx
> ; AVX2-NEXT: shrl $16, %ecx
> -; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: movl %eax, %ecx
> ; AVX2-NEXT: shrl $24, %ecx
> -; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: movq %rax, %rcx
> ; AVX2-NEXT: shrq $32, %rcx
> -; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: movq %rax, %rcx
> ; AVX2-NEXT: shrq $40, %rcx
> -; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: movq %rax, %rcx
> ; AVX2-NEXT: shrq $48, %rcx
> -; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
> ; AVX2-NEXT: shrq $56, %rax
> -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
> +; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
> +; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
> ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
> ; AVX2-NEXT: retq
> %x4 = bitcast <32 x i8> %0 to <64 x i4>
>
> Modified: llvm/trunk/test/CodeGen/X86/load-partial.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-partial.ll?rev=366441&r1=366440&r2=366441&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/load-partial.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/load-partial.ll Thu Jul 18 07:33:25 2019
> @@ -54,32 +54,14 @@ define <8 x float> @load_float8_float3(<
> }
>
> define <4 x float> @load_float4_float3_as_float2_float(<4 x float>*
> nocapture readonly dereferenceable(16)) {
> -; SSE2-LABEL: load_float4_float3_as_float2_float:
> -; SSE2: # %bb.0:
> -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
> -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
> -; SSE2-NEXT: retq
> -;
> -; SSSE3-LABEL: load_float4_float3_as_float2_float:
> -; SSSE3: # %bb.0:
> -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
> -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
> -; SSSE3-NEXT: retq
> -;
> -; SSE41-LABEL: load_float4_float3_as_float2_float:
> -; SSE41: # %bb.0:
> -; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
> -; SSE41-NEXT: retq
> +; SSE-LABEL: load_float4_float3_as_float2_float:
> +; SSE: # %bb.0:
> +; SSE-NEXT: movups (%rdi), %xmm0
> +; SSE-NEXT: retq
> ;
> ; AVX-LABEL: load_float4_float3_as_float2_float:
> ; AVX: # %bb.0:
> -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
> +; AVX-NEXT: vmovups (%rdi), %xmm0
> ; AVX-NEXT: retq
> %2 = bitcast <4 x float>* %0 to <2 x float>*
> %3 = load <2 x float>, <2 x float>* %2, align 4
> @@ -94,36 +76,14 @@ define <4 x float> @load_float4_float3_a
> }
>
> define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture
> readonly dereferenceable(16)) {
> -; SSE2-LABEL: load_float4_float3_trunc:
> -; SSE2: # %bb.0:
> -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> -; SSE2-NEXT: retq
> -;
> -; SSSE3-LABEL: load_float4_float3_trunc:
> -; SSSE3: # %bb.0:
> -; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
> -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
> -; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
> -; SSSE3-NEXT: retq
> -;
> -; SSE41-LABEL: load_float4_float3_trunc:
> -; SSE41: # %bb.0:
> -; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
> -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
> -; SSE41-NEXT: retq
> +; SSE-LABEL: load_float4_float3_trunc:
> +; SSE: # %bb.0:
> +; SSE-NEXT: movaps (%rdi), %xmm0
> +; SSE-NEXT: retq
> ;
> ; AVX-LABEL: load_float4_float3_trunc:
> ; AVX: # %bb.0:
> -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
> -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
> +; AVX-NEXT: vmovaps (%rdi), %xmm0
> ; AVX-NEXT: retq
> %2 = bitcast <4 x float>* %0 to i64*
> %3 = load i64, i64* %2, align 16
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20190718/3563d90e/attachment.html>
More information about the llvm-commits
mailing list