<div dir="ltr">This caused some crashes building some vector code in skia:<div><a href="https://ci.chromium.org/p/chromium/builders/ci/ToTLinux/7145">https://ci.chromium.org/p/chromium/builders/ci/ToTLinux/7145</a> </div><div>I reverted and started creduce, but I'm leaving for the day soon. <br></div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Jul 18, 2019 at 7:33 AM Simon Pilgrim via llvm-commits <<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">Author: rksimon<br>
Date: Thu Jul 18 07:33:25 2019<br>
New Revision: 366441<br>
<br>
URL: http://llvm.org/viewvc/llvm-project?rev=366441&view=rev
Log:<br>
[X86] EltsFromConsecutiveLoads - support common source loads<br>
<br>
This patch finds the source load for each element, splitting it into a Load and a ByteOffset, and attempts to recognise consecutive loads that in fact come from the same source load.
<br>
A helper function, findEltLoadSrc, recurses through an element's operands to find a LoadSDNode and determine the element's byte offset within it. When matching consecutive loads, a byte-offsetted load is then checked against a previous load that has already been confirmed to be a consecutive match.
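To make the mechanism concrete, here is a hypothetical IR snippet (not taken from the tests below; the function name and value names are made up) in which a <4 x i32> build_vector is assembled from two adjacent i64 loads. findEltLoadSrc resolves lanes 0/1 to the first load at byte offsets 0 and 4, and lanes 2/3 to the second load at byte offsets 0 and 4; the consecutive-load check can then treat the whole thing as a single 16-byte load, type legality and the surrounding DAG combines permitting:

  define <4 x i32> @build_from_two_i64_loads(i64* %p) {
    %p1 = getelementptr inbounds i64, i64* %p, i64 1
    %a  = load i64, i64* %p, align 16   ; source load for lanes 0 and 1
    %b  = load i64, i64* %p1, align 8   ; source load for lanes 2 and 3
    %a0 = trunc i64 %a to i32           ; lane 0: load %a, byte offset 0
    %as = lshr i64 %a, 32
    %a1 = trunc i64 %as to i32          ; lane 1: load %a, byte offset 4
    %b0 = trunc i64 %b to i32           ; lane 2: load %b, byte offset 0
    %bs = lshr i64 %b, 32
    %b1 = trunc i64 %bs to i32          ; lane 3: load %b, byte offset 4
    %v0 = insertelement <4 x i32> undef, i32 %a0, i32 0
    %v1 = insertelement <4 x i32> %v0, i32 %a1, i32 1
    %v2 = insertelement <4 x i32> %v1, i32 %b0, i32 2
    %v3 = insertelement <4 x i32> %v2, i32 %b1, i32 3
    ret <4 x i32> %v3
  }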
<br>
This is the next step towards PR16739 - after this we just need to account for shuffled/repeated elements to create a vector load + shuffle.
<br>
Differential Revision: https://reviews.llvm.org/D64551
<br>
Modified:<br>
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp<br>
llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll<br>
llvm/trunk/test/CodeGen/X86/load-partial.ll<br>
<br>
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp<br>
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=366441&r1=366440&r2=366441&view=diff
==============================================================================<br>
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)<br>
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Jul 18 07:33:25 2019<br>
@@ -7504,6 +7504,46 @@ static SDValue LowerAsSplatVectorLoad(SD<br>
return SDValue();<br>
}<br>
<br>
+// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
+static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {<br>
+ if (ISD::isNON_EXTLoad(Elt.getNode())) {<br>
+ Ld = cast<LoadSDNode>(Elt);<br>
+ ByteOffset = 0;<br>
+ return true;<br>
+ }<br>
+<br>
+ switch (Elt.getOpcode()) {<br>
+ case ISD::BITCAST:<br>
+ case ISD::TRUNCATE:<br>
+ case ISD::SCALAR_TO_VECTOR:<br>
+ return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);<br>
+ case ISD::SRL:<br>
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {<br>
+ uint64_t Idx = Elt.getConstantOperandVal(1);<br>
+ if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {<br>
+ ByteOffset += Idx / 8;<br>
+ return true;<br>
+ }<br>
+ }<br>
+ break;<br>
+ case ISD::EXTRACT_VECTOR_ELT:<br>
+ if (isa<ConstantSDNode>(Elt.getOperand(1))) {<br>
+ SDValue Src = Elt.getOperand(0);<br>
+ unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();<br>
+ unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();<br>
+ if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&<br>
+ findEltLoadSrc(Src, Ld, ByteOffset)) {<br>
+ uint64_t Idx = Elt.getConstantOperandVal(1);<br>
+ ByteOffset += Idx * (SrcSizeInBits / 8);<br>
+ return true;<br>
+ }<br>
+ }<br>
+ break;<br>
+ }<br>
+<br>
+ return false;<br>
+}<br>
+<br>
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the<br>
/// elements can be replaced by a single large load which has the same value as<br>
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.<br>
@@ -7521,6 +7561,7 @@ static SDValue EltsFromConsecutiveLoads(<br>
APInt UndefMask = APInt::getNullValue(NumElems);<br>
<br>
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);<br>
+ SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);<br>
<br>
// For each element in the initializer, see if we've found a load, zero or an<br>
// undef.<br>
@@ -7539,13 +7580,17 @@ static SDValue EltsFromConsecutiveLoads(<br>
<br>
// Each loaded element must be the correct fractional portion of the<br>
// requested vector load.<br>
- if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())<br>
+ unsigned EltSizeInBits = Elt.getValueSizeInBits();<br>
+ if ((NumElems * EltSizeInBits) != VT.getSizeInBits())<br>
return SDValue();<br>
<br>
- if (!ISD::isNON_EXTLoad(Elt.getNode()))<br>
+ if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]))<br>
return SDValue();<br>
+ assert(0 <= ByteOffsets[i] &&<br>
+ ((ByteOffsets[i] * 8) + EltSizeInBits) <=<br>
+ Loads[i]->getValueSizeInBits(0) &&<br>
+ "Element offset outside of load bounds");<br>
<br>
- Loads[i] = cast<LoadSDNode>(Elt);<br>
LoadMask.setBit(i);<br>
LastLoadedElt = i;<br>
}<br>
@@ -7575,6 +7620,20 @@ static SDValue EltsFromConsecutiveLoads(<br>
int LoadSizeInBits = (1 + LastLoadedElt - FirstLoadedElt) * BaseSizeInBits;<br>
assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");<br>
<br>
+ // Check to see if the element's load is consecutive to the base load<br>
+ // or offset from a previous (already checked) load.<br>
+ auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {<br>
+ LoadSDNode *Ld = Loads[EltIdx];<br>
+ int64_t ByteOffset = ByteOffsets[EltIdx];<br>
+ if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {<br>
+ int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);<br>
+ return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&<br>
+ Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);<br>
+ }<br>
+ return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,<br>
+ EltIdx - FirstLoadedElt);<br>
+ };<br>
+<br>
// Consecutive loads can contain UNDEFS but not ZERO elements.<br>
// Consecutive loads with UNDEFs and ZEROs elements require a<br>
// an additional shuffle stage to clear the ZERO elements.<br>
@@ -7582,8 +7641,7 @@ static SDValue EltsFromConsecutiveLoads(<br>
bool IsConsecutiveLoadWithZeros = true;<br>
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {<br>
if (LoadMask[i]) {<br>
- if (!DAG.areNonVolatileConsecutiveLoads(Loads[i], LDBase, BaseSizeInBytes,<br>
- i - FirstLoadedElt)) {<br>
+ if (!CheckConsecutiveLoad(LDBase, i)) {<br>
IsConsecutiveLoad = false;<br>
IsConsecutiveLoadWithZeros = false;<br>
break;<br>
<br>
Modified: llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll<br>
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll?rev=366441&r1=366440&r2=366441&view=diff
==============================================================================<br>
--- llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll (original)<br>
+++ llvm/trunk/test/CodeGen/X86/clear_upper_vector_element_bits.ll Thu Jul 18 07:33:25 2019<br>
@@ -985,99 +985,54 @@ define <32 x i8> @_clearupper32xi8b(<32<br>
; AVX1-LABEL: _clearupper32xi8b:<br>
; AVX1: # %bb.0:<br>
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)<br>
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %r9<br>
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rcx<br>
-; AVX1-NEXT: movq %r9, %r8<br>
-; AVX1-NEXT: shrq $56, %r8<br>
-; AVX1-NEXT: andl $15, %r8d<br>
-; AVX1-NEXT: movq %rcx, %rsi<br>
-; AVX1-NEXT: movq %rcx, %rdi<br>
-; AVX1-NEXT: movq %rcx, %rdx<br>
-; AVX1-NEXT: movq %rcx, %rax<br>
-; AVX1-NEXT: shrq $32, %rax<br>
-; AVX1-NEXT: andl $15, %eax<br>
-; AVX1-NEXT: shlq $32, %rax<br>
-; AVX1-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F<br>
-; AVX1-NEXT: orq %rax, %rcx<br>
-; AVX1-NEXT: movq %r9, %rax<br>
-; AVX1-NEXT: shrq $48, %rax<br>
-; AVX1-NEXT: andl $15, %eax<br>
-; AVX1-NEXT: shrq $40, %rdx<br>
-; AVX1-NEXT: andl $15, %edx<br>
-; AVX1-NEXT: shlq $40, %rdx<br>
-; AVX1-NEXT: orq %rcx, %rdx<br>
-; AVX1-NEXT: movq %r9, %rcx<br>
-; AVX1-NEXT: shrq $40, %rcx<br>
-; AVX1-NEXT: andl $15, %ecx<br>
-; AVX1-NEXT: shrq $48, %rdi<br>
-; AVX1-NEXT: andl $15, %edi<br>
-; AVX1-NEXT: shlq $48, %rdi<br>
-; AVX1-NEXT: orq %rdx, %rdi<br>
-; AVX1-NEXT: movq %r9, %rdx<br>
-; AVX1-NEXT: shrq $32, %rdx<br>
-; AVX1-NEXT: andl $15, %edx<br>
-; AVX1-NEXT: shrq $56, %rsi<br>
-; AVX1-NEXT: andl $15, %esi<br>
-; AVX1-NEXT: shlq $56, %rsi<br>
-; AVX1-NEXT: orq %rdi, %rsi<br>
-; AVX1-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)<br>
-; AVX1-NEXT: shlq $32, %rdx<br>
-; AVX1-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F<br>
-; AVX1-NEXT: orq %rdx, %r9<br>
-; AVX1-NEXT: shlq $40, %rcx<br>
-; AVX1-NEXT: orq %r9, %rcx<br>
-; AVX1-NEXT: shlq $48, %rax<br>
-; AVX1-NEXT: orq %rcx, %rax<br>
-; AVX1-NEXT: shlq $56, %r8<br>
-; AVX1-NEXT: orq %rax, %r8<br>
-; AVX1-NEXT: movq %r8, -{{[0-9]+}}(%rsp)<br>
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0<br>
-; AVX1-NEXT: vmovq %xmm0, %rax<br>
-; AVX1-NEXT: movq %rax, %r8<br>
-; AVX1-NEXT: movq %rax, %r9<br>
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax<br>
+; AVX1-NEXT: movq %rax, %rcx<br>
+; AVX1-NEXT: movq %rax, %rdx<br>
; AVX1-NEXT: movq %rax, %rsi<br>
; AVX1-NEXT: movq %rax, %rdi<br>
-; AVX1-NEXT: movl %eax, %ecx<br>
-; AVX1-NEXT: movl %eax, %edx<br>
-; AVX1-NEXT: vmovd %eax, %xmm1<br>
-; AVX1-NEXT: shrl $8, %eax<br>
-; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1<br>
-; AVX1-NEXT: shrl $16, %edx<br>
-; AVX1-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1<br>
-; AVX1-NEXT: shrl $24, %ecx<br>
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1<br>
; AVX1-NEXT: shrq $32, %rdi<br>
-; AVX1-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1<br>
+; AVX1-NEXT: andl $15, %edi<br>
+; AVX1-NEXT: shlq $32, %rdi<br>
+; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F<br>
+; AVX1-NEXT: orq %rdi, %rax<br>
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi<br>
; AVX1-NEXT: shrq $40, %rsi<br>
-; AVX1-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1<br>
-; AVX1-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2<br>
-; AVX1-NEXT: shrq $48, %r9<br>
-; AVX1-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1<br>
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax<br>
-; AVX1-NEXT: shrq $56, %r8<br>
-; AVX1-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0<br>
-; AVX1-NEXT: movl %eax, %ecx<br>
-; AVX1-NEXT: shrl $8, %ecx<br>
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0<br>
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: movl %eax, %ecx<br>
-; AVX1-NEXT: shrl $16, %ecx<br>
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: movl %eax, %ecx<br>
-; AVX1-NEXT: shrl $24, %ecx<br>
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: movq %rax, %rcx<br>
+; AVX1-NEXT: andl $15, %esi<br>
+; AVX1-NEXT: shlq $40, %rsi<br>
+; AVX1-NEXT: orq %rax, %rsi<br>
+; AVX1-NEXT: movq %rdi, %rax<br>
+; AVX1-NEXT: shrq $48, %rdx<br>
+; AVX1-NEXT: andl $15, %edx<br>
+; AVX1-NEXT: shlq $48, %rdx<br>
+; AVX1-NEXT: orq %rsi, %rdx<br>
+; AVX1-NEXT: movq %rdi, %rsi<br>
+; AVX1-NEXT: shrq $56, %rcx<br>
+; AVX1-NEXT: andl $15, %ecx<br>
+; AVX1-NEXT: shlq $56, %rcx<br>
+; AVX1-NEXT: orq %rdx, %rcx<br>
+; AVX1-NEXT: movq %rdi, %rdx<br>
+; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)<br>
+; AVX1-NEXT: movq %rdi, %rcx<br>
; AVX1-NEXT: shrq $32, %rcx<br>
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: movq %rax, %rcx<br>
-; AVX1-NEXT: shrq $40, %rcx<br>
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: movq %rax, %rcx<br>
-; AVX1-NEXT: shrq $48, %rcx<br>
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0<br>
-; AVX1-NEXT: vmovq %xmm2, %rcx<br>
+; AVX1-NEXT: andl $15, %ecx<br>
+; AVX1-NEXT: shlq $32, %rcx<br>
+; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F<br>
+; AVX1-NEXT: orq %rcx, %rdi<br>
+; AVX1-NEXT: shrq $40, %rdx<br>
+; AVX1-NEXT: andl $15, %edx<br>
+; AVX1-NEXT: shlq $40, %rdx<br>
+; AVX1-NEXT: orq %rdi, %rdx<br>
+; AVX1-NEXT: shrq $48, %rsi<br>
+; AVX1-NEXT: andl $15, %esi<br>
+; AVX1-NEXT: shlq $48, %rsi<br>
+; AVX1-NEXT: orq %rdx, %rsi<br>
; AVX1-NEXT: shrq $56, %rax<br>
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0<br>
+; AVX1-NEXT: andl $15, %eax<br>
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0<br>
+; AVX1-NEXT: shlq $56, %rax<br>
+; AVX1-NEXT: orq %rsi, %rax<br>
+; AVX1-NEXT: vmovq %xmm0, %rcx<br>
+; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)<br>
; AVX1-NEXT: movl %ecx, %eax<br>
; AVX1-NEXT: shrl $8, %eax<br>
; AVX1-NEXT: vmovd %ecx, %xmm1<br>
@@ -1097,129 +1052,85 @@ define <32 x i8> @_clearupper32xi8b(<32<br>
; AVX1-NEXT: movq %rcx, %rax<br>
; AVX1-NEXT: shrq $48, %rax<br>
; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1<br>
-; AVX1-NEXT: vpextrq $1, %xmm2, %rax<br>
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax<br>
; AVX1-NEXT: shrq $56, %rcx<br>
-; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0<br>
; AVX1-NEXT: movl %eax, %ecx<br>
; AVX1-NEXT: shrl $8, %ecx<br>
-; AVX1-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1<br>
-; AVX1-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0<br>
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: movl %eax, %ecx<br>
; AVX1-NEXT: shrl $16, %ecx<br>
-; AVX1-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: movl %eax, %ecx<br>
; AVX1-NEXT: shrl $24, %ecx<br>
-; AVX1-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: movq %rax, %rcx<br>
; AVX1-NEXT: shrq $32, %rcx<br>
-; AVX1-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: movq %rax, %rcx<br>
; AVX1-NEXT: shrq $40, %rcx<br>
-; AVX1-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: movq %rax, %rcx<br>
; AVX1-NEXT: shrq $48, %rcx<br>
-; AVX1-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0<br>
; AVX1-NEXT: shrq $56, %rax<br>
-; AVX1-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1<br>
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0<br>
+; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1<br>
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0<br>
; AVX1-NEXT: retq<br>
;<br>
; AVX2-LABEL: _clearupper32xi8b:<br>
; AVX2: # %bb.0:<br>
; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)<br>
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %r9<br>
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rcx<br>
-; AVX2-NEXT: movq %r9, %r8<br>
-; AVX2-NEXT: shrq $56, %r8<br>
-; AVX2-NEXT: andl $15, %r8d<br>
-; AVX2-NEXT: movq %rcx, %rsi<br>
-; AVX2-NEXT: movq %rcx, %rdi<br>
-; AVX2-NEXT: movq %rcx, %rdx<br>
-; AVX2-NEXT: movq %rcx, %rax<br>
-; AVX2-NEXT: shrq $32, %rax<br>
-; AVX2-NEXT: andl $15, %eax<br>
-; AVX2-NEXT: shlq $32, %rax<br>
-; AVX2-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F<br>
-; AVX2-NEXT: orq %rax, %rcx<br>
-; AVX2-NEXT: movq %r9, %rax<br>
-; AVX2-NEXT: shrq $48, %rax<br>
-; AVX2-NEXT: andl $15, %eax<br>
-; AVX2-NEXT: shrq $40, %rdx<br>
-; AVX2-NEXT: andl $15, %edx<br>
-; AVX2-NEXT: shlq $40, %rdx<br>
-; AVX2-NEXT: orq %rcx, %rdx<br>
-; AVX2-NEXT: movq %r9, %rcx<br>
-; AVX2-NEXT: shrq $40, %rcx<br>
-; AVX2-NEXT: andl $15, %ecx<br>
-; AVX2-NEXT: shrq $48, %rdi<br>
-; AVX2-NEXT: andl $15, %edi<br>
-; AVX2-NEXT: shlq $48, %rdi<br>
-; AVX2-NEXT: orq %rdx, %rdi<br>
-; AVX2-NEXT: movq %r9, %rdx<br>
-; AVX2-NEXT: shrq $32, %rdx<br>
-; AVX2-NEXT: andl $15, %edx<br>
-; AVX2-NEXT: shrq $56, %rsi<br>
-; AVX2-NEXT: andl $15, %esi<br>
-; AVX2-NEXT: shlq $56, %rsi<br>
-; AVX2-NEXT: orq %rdi, %rsi<br>
-; AVX2-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)<br>
-; AVX2-NEXT: shlq $32, %rdx<br>
-; AVX2-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F<br>
-; AVX2-NEXT: orq %rdx, %r9<br>
-; AVX2-NEXT: shlq $40, %rcx<br>
-; AVX2-NEXT: orq %r9, %rcx<br>
-; AVX2-NEXT: shlq $48, %rax<br>
-; AVX2-NEXT: orq %rcx, %rax<br>
-; AVX2-NEXT: shlq $56, %r8<br>
-; AVX2-NEXT: orq %rax, %r8<br>
-; AVX2-NEXT: movq %r8, -{{[0-9]+}}(%rsp)<br>
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0<br>
-; AVX2-NEXT: vmovq %xmm0, %rax<br>
-; AVX2-NEXT: movq %rax, %r8<br>
-; AVX2-NEXT: movq %rax, %r9<br>
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax<br>
+; AVX2-NEXT: movq %rax, %rcx<br>
+; AVX2-NEXT: movq %rax, %rdx<br>
; AVX2-NEXT: movq %rax, %rsi<br>
; AVX2-NEXT: movq %rax, %rdi<br>
-; AVX2-NEXT: movl %eax, %ecx<br>
-; AVX2-NEXT: movl %eax, %edx<br>
-; AVX2-NEXT: vmovd %eax, %xmm1<br>
-; AVX2-NEXT: shrl $8, %eax<br>
-; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1<br>
-; AVX2-NEXT: shrl $16, %edx<br>
-; AVX2-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1<br>
-; AVX2-NEXT: shrl $24, %ecx<br>
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1<br>
; AVX2-NEXT: shrq $32, %rdi<br>
-; AVX2-NEXT: vpinsrb $4, %edi, %xmm1, %xmm1<br>
+; AVX2-NEXT: andl $15, %edi<br>
+; AVX2-NEXT: shlq $32, %rdi<br>
+; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F<br>
+; AVX2-NEXT: orq %rdi, %rax<br>
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi<br>
; AVX2-NEXT: shrq $40, %rsi<br>
-; AVX2-NEXT: vpinsrb $5, %esi, %xmm1, %xmm1<br>
-; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm2<br>
-; AVX2-NEXT: shrq $48, %r9<br>
-; AVX2-NEXT: vpinsrb $6, %r9d, %xmm1, %xmm1<br>
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax<br>
-; AVX2-NEXT: shrq $56, %r8<br>
-; AVX2-NEXT: vpinsrb $7, %r8d, %xmm1, %xmm0<br>
-; AVX2-NEXT: movl %eax, %ecx<br>
-; AVX2-NEXT: shrl $8, %ecx<br>
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0<br>
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: movl %eax, %ecx<br>
-; AVX2-NEXT: shrl $16, %ecx<br>
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: movl %eax, %ecx<br>
-; AVX2-NEXT: shrl $24, %ecx<br>
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: movq %rax, %rcx<br>
+; AVX2-NEXT: andl $15, %esi<br>
+; AVX2-NEXT: shlq $40, %rsi<br>
+; AVX2-NEXT: orq %rax, %rsi<br>
+; AVX2-NEXT: movq %rdi, %rax<br>
+; AVX2-NEXT: shrq $48, %rdx<br>
+; AVX2-NEXT: andl $15, %edx<br>
+; AVX2-NEXT: shlq $48, %rdx<br>
+; AVX2-NEXT: orq %rsi, %rdx<br>
+; AVX2-NEXT: movq %rdi, %rsi<br>
+; AVX2-NEXT: shrq $56, %rcx<br>
+; AVX2-NEXT: andl $15, %ecx<br>
+; AVX2-NEXT: shlq $56, %rcx<br>
+; AVX2-NEXT: orq %rdx, %rcx<br>
+; AVX2-NEXT: movq %rdi, %rdx<br>
+; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)<br>
+; AVX2-NEXT: movq %rdi, %rcx<br>
; AVX2-NEXT: shrq $32, %rcx<br>
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: movq %rax, %rcx<br>
-; AVX2-NEXT: shrq $40, %rcx<br>
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: movq %rax, %rcx<br>
-; AVX2-NEXT: shrq $48, %rcx<br>
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0<br>
-; AVX2-NEXT: vmovq %xmm2, %rcx<br>
+; AVX2-NEXT: andl $15, %ecx<br>
+; AVX2-NEXT: shlq $32, %rcx<br>
+; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F<br>
+; AVX2-NEXT: orq %rcx, %rdi<br>
+; AVX2-NEXT: shrq $40, %rdx<br>
+; AVX2-NEXT: andl $15, %edx<br>
+; AVX2-NEXT: shlq $40, %rdx<br>
+; AVX2-NEXT: orq %rdi, %rdx<br>
+; AVX2-NEXT: shrq $48, %rsi<br>
+; AVX2-NEXT: andl $15, %esi<br>
+; AVX2-NEXT: shlq $48, %rsi<br>
+; AVX2-NEXT: orq %rdx, %rsi<br>
; AVX2-NEXT: shrq $56, %rax<br>
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0<br>
+; AVX2-NEXT: andl $15, %eax<br>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0<br>
+; AVX2-NEXT: shlq $56, %rax<br>
+; AVX2-NEXT: orq %rsi, %rax<br>
+; AVX2-NEXT: vmovq %xmm0, %rcx<br>
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)<br>
; AVX2-NEXT: movl %ecx, %eax<br>
; AVX2-NEXT: shrl $8, %eax<br>
; AVX2-NEXT: vmovd %ecx, %xmm1<br>
@@ -1239,30 +1150,31 @@ define <32 x i8> @_clearupper32xi8b(<32<br>
; AVX2-NEXT: movq %rcx, %rax<br>
; AVX2-NEXT: shrq $48, %rax<br>
; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1<br>
-; AVX2-NEXT: vpextrq $1, %xmm2, %rax<br>
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax<br>
; AVX2-NEXT: shrq $56, %rcx<br>
-; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0<br>
; AVX2-NEXT: movl %eax, %ecx<br>
; AVX2-NEXT: shrl $8, %ecx<br>
-; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1<br>
-; AVX2-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0<br>
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: movl %eax, %ecx<br>
; AVX2-NEXT: shrl $16, %ecx<br>
-; AVX2-NEXT: vpinsrb $10, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: movl %eax, %ecx<br>
; AVX2-NEXT: shrl $24, %ecx<br>
-; AVX2-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: movq %rax, %rcx<br>
; AVX2-NEXT: shrq $32, %rcx<br>
-; AVX2-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: movq %rax, %rcx<br>
; AVX2-NEXT: shrq $40, %rcx<br>
-; AVX2-NEXT: vpinsrb $13, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: movq %rax, %rcx<br>
; AVX2-NEXT: shrq $48, %rcx<br>
-; AVX2-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0<br>
; AVX2-NEXT: shrq $56, %rax<br>
-; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1<br>
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0<br>
+; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1<br>
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0<br>
; AVX2-NEXT: retq<br>
%x4 = bitcast <32 x i8> %0 to <64 x i4><br>
<br>
Modified: llvm/trunk/test/CodeGen/X86/load-partial.ll<br>
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-partial.ll?rev=366441&r1=366440&r2=366441&view=diff
==============================================================================<br>
--- llvm/trunk/test/CodeGen/X86/load-partial.ll (original)<br>
+++ llvm/trunk/test/CodeGen/X86/load-partial.ll Thu Jul 18 07:33:25 2019<br>
@@ -54,32 +54,14 @@ define <8 x float> @load_float8_float3(<<br>
}<br>
<br>
define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) {<br>
-; SSE2-LABEL: load_float4_float3_as_float2_float:<br>
-; SSE2: # %bb.0:<br>
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero<br>
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]<br>
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]<br>
-; SSE2-NEXT: retq<br>
-;<br>
-; SSSE3-LABEL: load_float4_float3_as_float2_float:<br>
-; SSSE3: # %bb.0:<br>
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero<br>
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]<br>
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]<br>
-; SSSE3-NEXT: retq<br>
-;<br>
-; SSE41-LABEL: load_float4_float3_as_float2_float:<br>
-; SSE41: # %bb.0:<br>
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero<br>
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]<br>
-; SSE41-NEXT: retq<br>
+; SSE-LABEL: load_float4_float3_as_float2_float:<br>
+; SSE: # %bb.0:<br>
+; SSE-NEXT: movups (%rdi), %xmm0<br>
+; SSE-NEXT: retq<br>
;<br>
; AVX-LABEL: load_float4_float3_as_float2_float:<br>
; AVX: # %bb.0:<br>
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero<br>
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]<br>
+; AVX-NEXT: vmovups (%rdi), %xmm0<br>
; AVX-NEXT: retq<br>
%2 = bitcast <4 x float>* %0 to <2 x float>*<br>
%3 = load <2 x float>, <2 x float>* %2, align 4<br>
@@ -94,36 +76,14 @@ define <4 x float> @load_float4_float3_a<br>
}<br>
<br>
define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {<br>
-; SSE2-LABEL: load_float4_float3_trunc:<br>
-; SSE2: # %bb.0:<br>
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]<br>
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]<br>
-; SSE2-NEXT: retq<br>
-;<br>
-; SSSE3-LABEL: load_float4_float3_trunc:<br>
-; SSSE3: # %bb.0:<br>
-; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]<br>
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero<br>
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]<br>
-; SSSE3-NEXT: retq<br>
-;<br>
-; SSE41-LABEL: load_float4_float3_trunc:<br>
-; SSE41: # %bb.0:<br>
-; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]<br>
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]<br>
-; SSE41-NEXT: retq<br>
+; SSE-LABEL: load_float4_float3_trunc:<br>
+; SSE: # %bb.0:<br>
+; SSE-NEXT: movaps (%rdi), %xmm0<br>
+; SSE-NEXT: retq<br>
;<br>
; AVX-LABEL: load_float4_float3_trunc:<br>
; AVX: # %bb.0:<br>
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero<br>
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]<br>
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]<br>
+; AVX-NEXT: vmovaps (%rdi), %xmm0<br>
; AVX-NEXT: retq<br>
%2 = bitcast <4 x float>* %0 to i64*<br>
%3 = load i64, i64* %2, align 16<br>
<br>
<br>
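For reference, the load-partial.ll improvement above corresponds to IR along the following lines (a reconstructed sketch based on the fragment of the test body visible in the diff; the "_sketch" suffix and the value names are invented). The first two lanes come from a single i64 load that findEltLoadSrc resolves to byte offsets 0 and 4, the third lane is a separate but consecutive float load, and the whole build can now be emitted as the single movaps/vmovaps seen in the updated CHECK lines:

  define <4 x float> @load_float4_float3_trunc_sketch(<4 x float>* nocapture readonly dereferenceable(16) %p) {
    %pi = bitcast <4 x float>* %p to i64*
    %w  = load i64, i64* %pi, align 16        ; one source load covering lanes 0 and 1
    %lo = trunc i64 %w to i32                 ; byte offset 0 within %w
    %sh = lshr i64 %w, 32
    %hi = trunc i64 %sh to i32                ; byte offset 4 within %w
    %f0 = bitcast i32 %lo to float
    %f1 = bitcast i32 %hi to float
    %pf0 = bitcast <4 x float>* %p to float*
    %pf2 = getelementptr inbounds float, float* %pf0, i64 2
    %f2  = load float, float* %pf2, align 8   ; consecutive with the bytes of %w
    %v0 = insertelement <4 x float> undef, float %f0, i32 0
    %v1 = insertelement <4 x float> %v0, float %f1, i32 1
    %v2 = insertelement <4 x float> %v1, float %f2, i32 2
    ret <4 x float> %v2
  }

Whether this exact IR folds depends on the rest of lowering; the point is only to show a build_vector whose lanes trace back, through findEltLoadSrc, to a load plus byte offset.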
_______________________________________________
llvm-commits mailing list
llvm-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits