[llvm] r366501 - Revert [X86] EltsFromConsecutiveLoads - support common source loads
Reid Kleckner via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 18 14:26:42 PDT 2019
Author: rnk
Date: Thu Jul 18 14:26:41 2019
New Revision: 366501
URL: http://llvm.org/viewvc/llvm-project?rev=366501&view=rev
Log:
Revert [X86] EltsFromConsecutiveLoads - support common source loads
This reverts r366441 (git commit 48104ef7c9c653bbb732b66d7254957389fea337)
This causes clang to fail to compile a file in Skia. A reduced test case will follow soon.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
llvm/trunk/test/CodeGen/X86/load-partial.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=366501&r1=366500&r2=366501&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jul 18 14:26:41 2019
@@ -7504,46 +7504,6 @@ static SDValue LowerAsSplatVectorLoad(SD
return SDValue();
}
-// Recurse to find a LoadSDNode source and the accumulated ByteOffest.
-static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
- if (ISD::isNON_EXTLoad(Elt.getNode())) {
- Ld = cast<LoadSDNode>(Elt);
- ByteOffset = 0;
- return true;
- }
-
- switch (Elt.getOpcode()) {
- case ISD::BITCAST:
- case ISD::TRUNCATE:
- case ISD::SCALAR_TO_VECTOR:
- return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
- case ISD::SRL:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
- if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
- ByteOffset += Idx / 8;
- return true;
- }
- }
- break;
- case ISD::EXTRACT_VECTOR_ELT:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
- SDValue Src = Elt.getOperand(0);
- unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
- unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
- if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
- findEltLoadSrc(Src, Ld, ByteOffset)) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
- ByteOffset += Idx * (SrcSizeInBits / 8);
- return true;
- }
- }
- break;
- }
-
- return false;
-}
-
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
@@ -7561,7 +7521,6 @@ static SDValue EltsFromConsecutiveLoads(
APInt UndefMask = APInt::getNullValue(NumElems);
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
- SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
// For each element in the initializer, see if we've found a load, zero or an
// undef.
@@ -7580,17 +7539,13 @@ static SDValue EltsFromConsecutiveLoads(
// Each loaded element must be the correct fractional portion of the
// requested vector load.
- unsigned EltSizeInBits = Elt.getValueSizeInBits();
- if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
+ if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
return SDValue();
- if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]))
+ if (!ISD::isNON_EXTLoad(Elt.getNode()))
return SDValue();
- assert(0 <= ByteOffsets[i] &&
- ((ByteOffsets[i] * 8) + EltSizeInBits) <=
- Loads[i]->getValueSizeInBits(0) &&
- "Element offset outside of load bounds");
+ Loads[i] = cast<LoadSDNode>(Elt);
LoadMask.setBit(i);
LastLoadedElt = i;
}
@@ -7620,20 +7575,6 @@ static SDValue EltsFromConsecutiveLoads(
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
- // Check to see if the element's load is consecutive to the base load
- // or offset from a previous (already checked) load.
- auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
- LoadSDNode *Ld = Loads[EltIdx];
- int64_t ByteOffset = ByteOffsets[EltIdx];
- if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
- int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
- return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
- Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
- }
- return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
- EltIdx - FirstLoadedElt);
- };
-
// Consecutive loads can contain UNDEFS but not ZERO elements.
// Consecutive loads with UNDEFs and ZEROs elements require a
// an additional shuffle stage to clear the ZERO elements.
@@ -7641,7 +7582,8 @@ static SDValue EltsFromConsecutiveLoads(
bool IsConsecutiveLoadWithZeros = true;
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
if (LoadMask[i]) {
- if (!CheckConsecutiveLoad(LDBase, i)) {
+ if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,
+ i - FirstLoadedElt)) {
IsConsecutiveLoad = false;
IsConsecutiveLoadWithZeros = false;
break;
Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=366501&r1=366500&r2=366501&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Thu Jul 18 14:26:41 2019
@@ -985,54 +985,99 @@ define <32 x i8> @_clearupper32xi8b(<32
; AVX1-LABEL: _clearupper32xi8b:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: movq %rax, %rdx
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX1-NEXT: movq %r9, %r8
+; AVX1-NEXT: shrq $56, %r8
+; AVX1-NEXT: andl $15, %r8d
+; AVX1-NEXT: movq %rcx, %rsi
+; AVX1-NEXT: movq %rcx, %rdi
+; AVX1-NEXT: movq %rcx, %rdx
+; AVX1-NEXT: movq %rcx, %rax
+; AVX1-NEXT: shrq $32, %rax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: shlq $32, %rax
+; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; AVX1-NEXT: orq %rax, %rcx
+; AVX1-NEXT: movq %r9, %rax
+; AVX1-NEXT: shrq $48, %rax
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: shrq $40, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $40, %rdx
+; AVX1-NEXT: orq %rcx, %rdx
+; AVX1-NEXT: movq %r9, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shrq $48, %rdi
+; AVX1-NEXT: andl $15, %edi
+; AVX1-NEXT: shlq $48, %rdi
+; AVX1-NEXT: orq %rdx, %rdi
+; AVX1-NEXT: movq %r9, %rdx
+; AVX1-NEXT: shrq $32, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shrq $56, %rsi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shlq $56, %rsi
+; AVX1-NEXT: orq %rdi, %rsi
+; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: shlq $32, %rdx
+; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
+; AVX1-NEXT: orq %rdx, %r9
+; AVX1-NEXT: shlq $40, %rcx
+; AVX1-NEXT: orq %r9, %rcx
+; AVX1-NEXT: shlq $48, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: shlq $56, %r8
+; AVX1-NEXT: orq %rax, %r8
+; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %r9
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: movq %rax, %rdi
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: movl %eax, %edx
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: shrl $8, %eax
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT: shrl $16, %edx
+; AVX1-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
; AVX1-NEXT: shrq $32, %rdi
-; AVX1-NEXT: andl $15, %edi
-; AVX1-NEXT: shlq $32, %rdi
-; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX1-NEXT: orq %rdi, %rax
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX1-NEXT: shrq $40, %rsi
-; AVX1-NEXT: andl $15, %esi
-; AVX1-NEXT: shlq $40, %rsi
-; AVX1-NEXT: orq %rax, %rsi
-; AVX1-NEXT: movq %rdi, %rax
-; AVX1-NEXT: shrq $48, %rdx
-; AVX1-NEXT: andl $15, %edx
-; AVX1-NEXT: shlq $48, %rdx
-; AVX1-NEXT: orq %rsi, %rdx
-; AVX1-NEXT: movq %rdi, %rsi
-; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: andl $15, %ecx
-; AVX1-NEXT: shlq $56, %rcx
-; AVX1-NEXT: orq %rdx, %rcx
-; AVX1-NEXT: movq %rdi, %rdx
-; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %rdi, %rcx
+; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX1-NEXT: shrq $48, %r9
+; AVX1-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: shrq $56, %r8
+; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: andl $15, %ecx
-; AVX1-NEXT: shlq $32, %rcx
-; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
-; AVX1-NEXT: orq %rcx, %rdi
-; AVX1-NEXT: shrq $40, %rdx
-; AVX1-NEXT: andl $15, %edx
-; AVX1-NEXT: shlq $40, %rdx
-; AVX1-NEXT: orq %rdi, %rdx
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: andl $15, %esi
-; AVX1-NEXT: shlq $48, %rsi
-; AVX1-NEXT: orq %rdx, %rsi
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vmovq %xmm2, %rcx
; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: andl $15, %eax
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: shlq $56, %rax
-; AVX1-NEXT: orq %rsi, %rax
-; AVX1-NEXT: vmovq %xmm0, %rcx
-; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: movl %ecx, %eax
; AVX1-NEXT: shrl $8, %eax
; AVX1-NEXT: vmovd %ecx, %xmm1
@@ -1052,85 +1097,129 @@ define <32 x i8> @_clearupper32xi8b(<32
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: shrq $48, %rax
; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: vpextrq $1, %xmm2, %rax
; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movl %eax, %ecx
; AVX1-NEXT: shrl $24, %ecx
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $40, %rcx
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: _clearupper32xi8b:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: movq %rax, %rdx
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT: movq %r9, %r8
+; AVX2-NEXT: shrq $56, %r8
+; AVX2-NEXT: andl $15, %r8d
+; AVX2-NEXT: movq %rcx, %rsi
+; AVX2-NEXT: movq %rcx, %rdi
+; AVX2-NEXT: movq %rcx, %rdx
+; AVX2-NEXT: movq %rcx, %rax
+; AVX2-NEXT: shrq $32, %rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: shlq $32, %rax
+; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: movq %r9, %rax
+; AVX2-NEXT: shrq $48, %rax
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: shrq $40, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: movq %r9, %rcx
+; AVX2-NEXT: shrq $40, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shrq $48, %rdi
+; AVX2-NEXT: andl $15, %edi
+; AVX2-NEXT: shlq $48, %rdi
+; AVX2-NEXT: orq %rdx, %rdi
+; AVX2-NEXT: movq %r9, %rdx
+; AVX2-NEXT: shrq $32, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shrq $56, %rsi
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: shlq $56, %rsi
+; AVX2-NEXT: orq %rdi, %rsi
+; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: shlq $32, %rdx
+; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
+; AVX2-NEXT: orq %rdx, %r9
+; AVX2-NEXT: shlq $40, %rcx
+; AVX2-NEXT: orq %r9, %rcx
+; AVX2-NEXT: shlq $48, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: shlq $56, %r8
+; AVX2-NEXT: orq %rax, %r8
+; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %r9
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: movq %rax, %rdi
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: movl %eax, %edx
+; AVX2-NEXT: vmovd %eax, %xmm1
+; AVX2-NEXT: shrl $8, %eax
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT: shrl $16, %edx
+; AVX2-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
+; AVX2-NEXT: shrl $24, %ecx
+; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
; AVX2-NEXT: shrq $32, %rdi
-; AVX2-NEXT: andl $15, %edi
-; AVX2-NEXT: shlq $32, %rdi
-; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX2-NEXT: orq %rdi, %rax
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1
; AVX2-NEXT: shrq $40, %rsi
-; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: shlq $40, %rsi
-; AVX2-NEXT: orq %rax, %rsi
-; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: shrq $48, %rdx
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: shlq $48, %rdx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: shlq $56, %rcx
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: movq %rdi, %rdx
-; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rdi, %rcx
+; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX2-NEXT: shrq $48, %r9
+; AVX2-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: shrq $56, %r8
+; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $24, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: shlq $32, %rcx
-; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
-; AVX2-NEXT: orq %rcx, %rdi
-; AVX2-NEXT: shrq $40, %rdx
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: shlq $40, %rdx
-; AVX2-NEXT: orq %rdi, %rdx
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: shlq $48, %rsi
-; AVX2-NEXT: orq %rdx, %rsi
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $40, %rcx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm2, %rcx
; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: andl $15, %eax
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: shlq $56, %rax
-; AVX2-NEXT: orq %rsi, %rax
-; AVX2-NEXT: vmovq %xmm0, %rcx
-; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrl $8, %eax
; AVX2-NEXT: vmovd %ecx, %xmm1
@@ -1150,31 +1239,30 @@ define <32 x i8> @_clearupper32xi8b(<32
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: shrq $48, %rax
; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1
; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
%x4 = bitcast <32 x i8> %0 to <64 x i4>
Modified: llvm/trunk/test/CodeGen/X86/load-partial.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-partial.ll?rev=366501&r1=366500&r2=366501&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/load-partial.ll (original)
+++ llvm/trunk/test/CodeGen/X86/load-partial.ll Thu Jul 18 14:26:41 2019
@@ -54,14 +54,32 @@ define <8 x float> @load_float8_float3(<
}
define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE-LABEL: load_float4_float3_as_float2_float:
-; SSE: # %bb.0:
-; SSE-NEXT: movups (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: load_float4_float3_as_float2_float:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_float4_float3_as_float2_float:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_float4_float3_as_float2_float:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: load_float4_float3_as_float2_float:
; AVX: # %bb.0:
-; AVX-NEXT: vmovups (%rdi), %xmm0
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
%2 = bitcast <4 x float>* %0 to <2 x float>*
%3 = load <2 x float>, <2 x float>* %2, align 4
@@ -76,14 +94,36 @@ define <4 x float> @load_float4_float3_a
}
define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE-LABEL: load_float4_float3_trunc:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: load_float4_float3_trunc:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: load_float4_float3_trunc:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: load_float4_float3_trunc:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: load_float4_float3_trunc:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
%2 = bitcast <4 x float>* %0 to i64*
%3 = load i64, i64* %2, align 16
More information about the llvm-commits mailing list