[llvm] r363592 - [X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 11:20:04 PDT 2019
Author: rksimon
Date: Mon Jun 17 11:20:04 2019
New Revision: 363592
URL: http://llvm.org/viewvc/llvm-project?rev=363592&view=rev
Log:
[X86][SSE] Scalarize under-aligned XMM vector nt-stores (PR42026)
If an XMM non-temporal store has less than natural alignment, scalarize the vector: with SSE4A we can stay in vector registers and use MOVNTSD (f64), otherwise we must move to GPRs and use MOVNTI (i32/i64).
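As a minimal sketch of the affected pattern (mirroring the stores in nontemporal-3.ll; the function name is illustrative), a 128-bit non-temporal store with below-natural alignment can no longer use MOVNTPS/MOVNTDQ and is now emitted per element:

  define void @nt_store_align1(<4 x float>* %dst, <4 x float> %v) nounwind {
    ; align 1 < the natural 16-byte alignment, so this is scalarized:
    ; two MOVNTSD stores with SSE4A, else MOVNTI stores from i32/i64 GPRs.
    store <4 x float> %v, <4 x float>* %dst, align 1, !nontemporal !1
    ret void
  }
  !1 = !{i32 1}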
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363592&r1=363591&r2=363592&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 17 11:20:04 2019
@@ -21110,6 +21110,42 @@ static SDValue splitVectorStore(StoreSDN
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
+/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
+/// type.
+static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
+ SelectionDAG &DAG) {
+ SDValue StoredVal = Store->getValue();
+ assert(StoreVT.is128BitVector() &&
+ StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
+ StoredVal = DAG.getBitcast(StoreVT, StoredVal);
+
+ // Splitting volatile memory ops is not allowed unless the operation was not
+ // legal to begin with. We are assuming the input op is legal (this transform
+ // is only used for targets with AVX).
+ if (Store->isVolatile())
+ return SDValue();
+
+ MVT StoreSVT = StoreVT.getScalarType();
+ unsigned NumElems = StoreVT.getVectorNumElements();
+ unsigned ScalarSize = StoreSVT.getStoreSize();
+ unsigned Alignment = Store->getAlignment();
+
+ SDLoc DL(Store);
+ SmallVector<SDValue, 4> Stores;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Offset = i * ScalarSize;
+ SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), Offset, DL);
+ SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
+ DAG.getIntPtrConstant(i, DL));
+ SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
+ Store->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Alignment, Offset),
+ Store->getMemOperand()->getFlags());
+ Stores.push_back(Ch);
+ }
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
@@ -39640,6 +39676,15 @@ static SDValue combineStore(SDNode *N, S
return SDValue();
return splitVectorStore(St, DAG);
}
+
+ // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
+ // to use MOVNTI.
+ if (VT.is128BitVector() && Subtarget.hasSSE2()) {
+ MVT NTVT = Subtarget.hasSSE4A()
+ ? MVT::v2f64
+ : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
+ return scalarizeVectorStore(St, NTVT, DAG);
+ }
}
// Optimize trunc store (of multiple scalars) to shuffle and store.
Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll?rev=363592&r1=363591&r2=363592&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll Mon Jun 17 11:20:04 2019
@@ -298,67 +298,105 @@ define void @merge_2_v4f32_align1_ntload
; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
-; X86-LABEL: merge_2_v4f32_align1_ntstore:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $48, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movups (%ecx), %xmm0
-; X86-NEXT: movups 16(%ecx), %xmm1
-; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 12(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 8(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movntil %edx, 4(%eax)
-; X86-NEXT: movntil %ecx, (%eax)
-; X86-NEXT: movaps %xmm1, (%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 28(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 24(%eax)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movntil %edx, 20(%eax)
-; X86-NEXT: movntil %ecx, 16(%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
-;
-; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movups (%rdi), %xmm0
-; X64-SSE-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-SSE-NEXT: movntiq %rcx, 8(%rsi)
-; X64-SSE-NEXT: movntiq %rax, (%rsi)
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-SSE-NEXT: movntiq %rcx, 24(%rsi)
-; X64-SSE-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE-NEXT: retq
+; X86-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, (%eax)
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: movd %xmm2, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movd %xmm2, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
+; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
+; X86-SSE4A: # %bb.0:
+; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE4A-NEXT: movups (%ecx), %xmm0
+; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
+; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
+; X86-SSE4A-NEXT: retl
+;
+; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, (%rsi)
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
+; X64-SSE2-NEXT: movq %xmm1, %rax
+; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE4A-LABEL: merge_2_v4f32_align1_ntstore:
+; X64-SSE4A: # %bb.0:
+; X64-SSE4A-NEXT: movups (%rdi), %xmm0
+; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
+; X64-SSE4A-NEXT: retq
+;
+; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
+; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
+; X64-SSE41-NEXT: movq %xmm0, %rax
+; X64-SSE41-NEXT: movntiq %rax, (%rsi)
+; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
+; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
+; X64-SSE41-NEXT: movq %xmm1, %rax
+; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
+; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %xmm0
-; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
+; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
+; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
-; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
+; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
+; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
@@ -375,67 +413,105 @@ define void @merge_2_v4f32_align1_ntstor
; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
-; X86-LABEL: merge_2_v4f32_align1:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $48, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movups (%ecx), %xmm0
-; X86-NEXT: movups 16(%ecx), %xmm1
-; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 12(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 8(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movntil %edx, 4(%eax)
-; X86-NEXT: movntil %ecx, (%eax)
-; X86-NEXT: movaps %xmm1, (%esp)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 28(%eax)
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movntil %ecx, 24(%eax)
-; X86-NEXT: movl (%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movntil %edx, 20(%eax)
-; X86-NEXT: movntil %ecx, 16(%eax)
-; X86-NEXT: movl %ebp, %esp
-; X86-NEXT: popl %ebp
-; X86-NEXT: retl
-;
-; X64-SSE-LABEL: merge_2_v4f32_align1:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movups (%rdi), %xmm0
-; X64-SSE-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-SSE-NEXT: movntiq %rcx, 8(%rsi)
-; X64-SSE-NEXT: movntiq %rax, (%rsi)
-; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-SSE-NEXT: movntiq %rcx, 24(%rsi)
-; X64-SSE-NEXT: movntiq %rax, 16(%rsi)
-; X64-SSE-NEXT: retq
+; X86-SSE2-LABEL: merge_2_v4f32_align1:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT: movdqu (%ecx), %xmm0
+; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, (%eax)
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: movd %xmm2, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; X86-SSE2-NEXT: movd %xmm2, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
+; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
+; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
+; X86-SSE2-NEXT: retl
+;
+; X86-SSE4A-LABEL: merge_2_v4f32_align1:
+; X86-SSE4A: # %bb.0:
+; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE4A-NEXT: movups (%ecx), %xmm0
+; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1
+; X86-SSE4A-NEXT: movntsd %xmm0, (%eax)
+; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax)
+; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax)
+; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax)
+; X86-SSE4A-NEXT: retl
+;
+; X64-SSE2-LABEL: merge_2_v4f32_align1:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, (%rsi)
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, 8(%rsi)
+; X64-SSE2-NEXT: movq %xmm1, %rax
+; X64-SSE2-NEXT: movntiq %rax, 16(%rsi)
+; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; X64-SSE2-NEXT: movq %xmm0, %rax
+; X64-SSE2-NEXT: movntiq %rax, 24(%rsi)
+; X64-SSE2-NEXT: retq
+;
+; X64-SSE4A-LABEL: merge_2_v4f32_align1:
+; X64-SSE4A: # %bb.0:
+; X64-SSE4A-NEXT: movups (%rdi), %xmm0
+; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1
+; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi)
+; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi)
+; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi)
+; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi)
+; X64-SSE4A-NEXT: retq
+;
+; X64-SSE41-LABEL: merge_2_v4f32_align1:
+; X64-SSE41: # %bb.0:
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
+; X64-SSE41-NEXT: pextrq $1, %xmm0, %rax
+; X64-SSE41-NEXT: movntiq %rax, 8(%rsi)
+; X64-SSE41-NEXT: movq %xmm0, %rax
+; X64-SSE41-NEXT: movntiq %rax, (%rsi)
+; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax
+; X64-SSE41-NEXT: movntiq %rax, 24(%rsi)
+; X64-SSE41-NEXT: movq %xmm1, %rax
+; X64-SSE41-NEXT: movntiq %rax, 16(%rsi)
+; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %xmm0
-; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
+; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
+; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax
+; X64-AVX-NEXT: movntiq %rax, 8(%rsi)
+; X64-AVX-NEXT: vmovq %xmm0, %rax
; X64-AVX-NEXT: movntiq %rax, (%rsi)
-; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT: vpextrq $1, %xmm1, %rax
+; X64-AVX-NEXT: movntiq %rax, 24(%rsi)
+; X64-AVX-NEXT: vmovq %xmm1, %rax
; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-3.ll?rev=363592&r1=363591&r2=363592&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll Mon Jun 17 11:20:04 2019
@@ -14,31 +14,22 @@
define void @test_zero_v2f64_align1(<2 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v2f64_align1:
; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2f64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v2f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1
@@ -46,33 +37,39 @@ define void @test_zero_v2f64_align1(<2 x
}
define void @test_zero_v4f32_align1(<4 x float>* %dst) nounwind {
-; SSE-LABEL: test_zero_v4f32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v4f32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v4f32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v4f32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v4f32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
@@ -80,33 +77,39 @@ define void @test_zero_v4f32_align1(<4 x
}
define void @test_zero_v2i64_align1(<2 x i64>* %dst) nounwind {
-; SSE-LABEL: test_zero_v2i64_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v2i64_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v2i64_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v2i64_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v2i64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v2i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1
@@ -114,33 +117,39 @@ define void @test_zero_v2i64_align1(<2 x
}
define void @test_zero_v4i32_align1(<4 x i32>* %dst) nounwind {
-; SSE-LABEL: test_zero_v4i32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v4i32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v4i32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v4i32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v4i32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
@@ -148,33 +157,39 @@ define void @test_zero_v4i32_align1(<4 x
}
define void @test_zero_v8i16_align1(<8 x i16>* %dst) nounwind {
-; SSE-LABEL: test_zero_v8i16_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v8i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v8i16_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i16_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1
@@ -182,33 +197,39 @@ define void @test_zero_v8i16_align1(<8 x
}
define void @test_zero_v16i8_align1(<16 x i8>* %dst) nounwind {
-; SSE-LABEL: test_zero_v16i8_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v16i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v16i8_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i8_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1
@@ -220,292 +241,274 @@ define void @test_zero_v16i8_align1(<16
define void @test_zero_v4f64_align1(<4 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v4f64_align1:
; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind {
-; SSE-LABEL: test_zero_v8f32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v8f32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8f32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8f32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v8f32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind {
-; SSE-LABEL: test_zero_v4i64_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v4i64_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v4i64_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v4i64_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v4i64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
-; SSE-LABEL: test_zero_v8i32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v8i32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8i32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8i32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v8i32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
-; SSE-LABEL: test_zero_v16i16_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v16i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v16i16_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i16_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
-; SSE-LABEL: test_zero_v32i8_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v32i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v32i8_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
-; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i8_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
-; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
ret void
@@ -666,76 +669,40 @@ define void @test_zero_v32i8_align16(<32
define void @test_zero_v8f64_align1(<8 x double>* %dst) nounwind {
; SSE-LABEL: test_zero_v8f64_align1:
; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
@@ -743,78 +710,68 @@ define void @test_zero_v8f64_align1(<8 x
}
define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
-; SSE-LABEL: test_zero_v16f32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v16f32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16f32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16f32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
@@ -822,78 +779,68 @@ define void @test_zero_v16f32_align1(<16
}
define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind {
-; SSE-LABEL: test_zero_v8i64_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v8i64_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8i64_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8i64_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
@@ -901,78 +848,68 @@ define void @test_zero_v8i64_align1(<8 x
}
define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
-; SSE-LABEL: test_zero_v16i32_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v16i32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16i32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16i32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
@@ -980,78 +917,68 @@ define void @test_zero_v16i32_align1(<16
}
define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
-; SSE-LABEL: test_zero_v32i16_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v32i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i16_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
@@ -1059,78 +986,68 @@ define void @test_zero_v32i16_align1(<32
}
define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
-; SSE-LABEL: test_zero_v64i8_align1:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_zero_v64i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v64i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v64i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align1:
; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 24(%rdi)
; AVX-NEXT: movntiq %rax, 16(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, 8(%rdi)
; AVX-NEXT: movntiq %rax, (%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
; AVX-NEXT: movntiq %rax, 48(%rdi)
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v64i8_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
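For reference, the pattern these tests exercise is simply an XMM-or-wider non-temporal store whose alignment is below the natural vector alignment; a minimal stand-alone reproducer (the function name is illustrative, the !nontemporal metadata follows the same !1 = !{i32 1} convention as the test file) would be:
; Under-aligned (align 1) 16-byte nt-store: with plain SSE2 this is now
; scalarized to GPR movnti stores, and with SSE4A to movntsd stores,
; instead of bouncing the vector through a stack temporary.
define void @nt_store_v4i32_align1(<4 x i32>* %dst) nounwind {
  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
  ret void
}
!1 = !{i32 1}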