[llvm] r363582 - [X86][AVX] Split under-aligned vector nt-stores.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 10:22:38 PDT 2019
Author: rksimon
Date: Mon Jun 17 10:22:38 2019
New Revision: 363582
URL: http://llvm.org/viewvc/llvm-project?rev=363582&view=rev
Log:
[X86][AVX] Split under-aligned vector nt-stores.
If a YMM/ZMM non-temporal store has less than its natural alignment, split the vector: either the halves will be satisfactorily aligned, or they will continue to be split until they are XMMs, at which point the legalizer will scalarize them to use MOVNTI.
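As an illustration (a minimal sketch - the function name is hypothetical, but the form matches the tests below), a 256-bit nt-store with only 16-byte alignment now lowers on AVX to two MOVNTPS XMM stores instead of a plain unaligned YMM store:

define void @nt_store_example(<8 x float> %v, <8 x float>* %dst) {
  store <8 x float> %v, <8 x float>* %dst, align 16, !nontemporal !1
  ret void
}
!1 = !{i32 1}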
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 17 10:22:38 2019
@@ -39545,6 +39545,7 @@ static SDValue combineStore(SDNode *N, S
EVT VT = St->getValue().getValueType();
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
+ unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -39595,8 +39596,6 @@ static SDValue combineStore(SDNode *N, S
StoredVal->ops().slice(32, 32));
Hi = combinevXi1ConstantToInteger(Hi, DAG);
- unsigned Alignment = St->getAlignment();
-
SDValue Ptr0 = St->getBasePtr();
SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
@@ -39631,6 +39630,18 @@ static SDValue combineStore(SDNode *N, S
return splitVectorStore(St, DAG);
}
+ // Split under-aligned vector non-temporal stores.
+ if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+ // vectors or the legalizer can scalarize it to use MOVNTI.
+ if (VT.is256BitVector() || VT.is512BitVector()) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+ return splitVectorStore(St, DAG);
+ }
+ }
+
// Optimize trunc store (of multiple scalars) to shuffle and store.
// First, pack all of the elements in one place. Next, store to memory
// in fewer chunks.
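The effect of the new check is recursive: splitVectorStore halves the store, and each half re-enters this combine. A rough sketch of the cascade for a fully unaligned 512-bit store (assuming no split ever lands on a better-aligned address):

  64-byte nt-store, align 1
    -> 2 x 32-byte nt-stores, align 1   (still under-aligned, split again)
    -> 4 x 16-byte nt-stores, align 1   (XMM: the legalizer scalarizes to MOVNTI)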
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-2.ll?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-2.ll Mon Jun 17 10:22:38 2019
@@ -1230,9 +1230,7 @@ define void @test_op_v32i8(<32 x i8> %a,
}
; 256-bit NT stores require 256-bit alignment.
-; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
-; could even scalarize to movnti when we have 1-alignment: nontemporal is
-; probably always worth even some 20 instruction scalarization.
+; For AVX, we lower 128-bit alignment as 2x movntps %xmm.
define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
; SSE-LABEL: test_unaligned_v8f32:
; SSE: # %bb.0:
@@ -1245,14 +1243,18 @@ define void @test_unaligned_v8f32(<8 x f
; AVX-LABEL: test_unaligned_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; VLX-LABEL: test_unaligned_v8f32:
; VLX: # %bb.0:
; VLX-NEXT: vaddps %ymm1, %ymm0, %ymm0
-; VLX-NEXT: vmovups %ymm0, (%rdi)
+; VLX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; VLX-NEXT: vmovntps %xmm1, 16(%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
; VLX-NEXT: vzeroupper
; VLX-NEXT: retq
%r = fadd <8 x float> %a, %b
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-3.ll?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll Mon Jun 17 10:22:38 2019
@@ -236,15 +236,31 @@ define void @test_zero_v4f64_align1(<4 x
; AVX-LABEL: test_zero_v4f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
ret void
@@ -269,15 +285,31 @@ define void @test_zero_v8f32_align1(<8 x
; AVX-LABEL: test_zero_v8f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
ret void
@@ -302,15 +334,31 @@ define void @test_zero_v4i64_align1(<4 x
; AVX-LABEL: test_zero_v4i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
ret void
@@ -335,15 +383,31 @@ define void @test_zero_v8i32_align1(<8 x
; AVX-LABEL: test_zero_v8i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
ret void
@@ -368,15 +432,31 @@ define void @test_zero_v16i16_align1(<16
; AVX-LABEL: test_zero_v16i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i16_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
ret void
@@ -401,15 +481,31 @@ define void @test_zero_v32i8_align1(<32
; AVX-LABEL: test_zero_v32i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i8_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
ret void
@@ -426,15 +522,15 @@ define void @test_zero_v4f64_align16(<4
; AVX-LABEL: test_zero_v4f64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1
ret void
@@ -451,15 +547,15 @@ define void @test_zero_v8f32_align16(<8
; AVX-LABEL: test_zero_v8f32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1
ret void
@@ -476,15 +572,15 @@ define void @test_zero_v4i64_align16(<4
; AVX-LABEL: test_zero_v4i64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i64_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1
ret void
@@ -501,15 +597,15 @@ define void @test_zero_v8i32_align16(<8
; AVX-LABEL: test_zero_v8i32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i32_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1
ret void
@@ -526,15 +622,15 @@ define void @test_zero_v16i16_align16(<1
; AVX-LABEL: test_zero_v16i16_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i16_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1
ret void
@@ -551,15 +647,15 @@ define void @test_zero_v32i8_align16(<32
; AVX-LABEL: test_zero_v32i8_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v32i8_align16:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %ymm0, (%rdi)
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1
ret void
@@ -574,60 +670,73 @@ define void @test_zero_v8f64_align1(<8 x
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
ret void
@@ -640,60 +749,73 @@ define void @test_zero_v16f32_align1(<16
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
ret void
@@ -706,60 +828,73 @@ define void @test_zero_v8i64_align1(<8 x
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
ret void
@@ -772,60 +907,73 @@ define void @test_zero_v16i32_align1(<16
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 56(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 48(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 40(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 32(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movntiq %rax, 24(%rdi)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
; AVX512-NEXT: movntiq %rax, 16(%rdi)
-; AVX512-NEXT: movq (%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: movntiq %rcx, 8(%rdi)
; AVX512-NEXT: movntiq %rax, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
ret void
@@ -838,69 +986,74 @@ define void @test_zero_v32i16_align1(<32
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
-; AVX512DQ-LABEL: test_zero_v32i16_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT: movq (%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT: movntiq %rax, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v32i16_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
ret void
}
@@ -912,69 +1065,74 @@ define void @test_zero_v64i8_align1(<64
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 56(%rdi)
-; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 40(%rdi)
-; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 24(%rdi)
-; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: movntiq %rcx, 8(%rdi)
-; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
-; AVX512DQ-LABEL: test_zero_v64i8_align1:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align1:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT: movq (%rsp), %rax
-; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT: movntiq %rax, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v64i8_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
ret void
}
@@ -983,39 +1141,28 @@ define void @test_zero_v8f64_align16(<8
; SSE-LABEL: test_zero_v8f64_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
ret void
@@ -1025,39 +1172,28 @@ define void @test_zero_v16f32_align16(<1
; SSE-LABEL: test_zero_v16f32_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
ret void
@@ -1067,39 +1203,28 @@ define void @test_zero_v8i64_align16(<8
; SSE-LABEL: test_zero_v8i64_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
ret void
@@ -1109,39 +1234,28 @@ define void @test_zero_v16i32_align16(<1
; SSE-LABEL: test_zero_v16i32_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %xmm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
; AVX512-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
ret void
@@ -1151,48 +1265,29 @@ define void @test_zero_v32i16_align16(<3
; SSE-LABEL: test_zero_v32i16_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v32i16_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v32i16_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
ret void
}
@@ -1201,48 +1296,29 @@ define void @test_zero_v64i8_align16(<64
; SSE-LABEL: test_zero_v64i8_align16:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movntps %xmm0, 48(%rdi)
-; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: movntps %xmm0, 16(%rdi)
; SSE-NEXT: movntps %xmm0, (%rdi)
+; SSE-NEXT: movntps %xmm0, 48(%rdi)
+; SSE-NEXT: movntps %xmm0, 32(%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align16:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX-NEXT: vmovups %ymm0, (%rdi)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT: vmovntps %xmm0, (%rdi)
+; AVX-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT: vmovntps %xmm0, 32(%rdi)
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v64i8_align16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v64i8_align16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
ret void
}
@@ -1267,18 +1343,9 @@ define void @test_zero_v8f64_align32(<8
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
@@ -1305,18 +1372,9 @@ define void @test_zero_v16f32_align32(<1
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
@@ -1343,18 +1401,9 @@ define void @test_zero_v8i64_align32(<8
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
@@ -1381,18 +1430,9 @@ define void @test_zero_v16i32_align32(<1
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: movq %rsp, %rbp
-; AVX512-NEXT: andq $-64, %rsp
-; AVX512-NEXT: subq $128, %rsp
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512-NEXT: vmovaps (%rsp), %ymm0
-; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
; AVX512-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512-NEXT: movq %rbp, %rsp
-; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
@@ -1417,30 +1457,13 @@ define void @test_zero_v32i16_align32(<3
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v32i16_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v32i16_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
ret void
}
@@ -1463,30 +1486,13 @@ define void @test_zero_v64i8_align32(<64
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
-; AVX512DQ-LABEL: test_zero_v64i8_align32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: movq %rsp, %rbp
-; AVX512BW-NEXT: andq $-64, %rsp
-; AVX512BW-NEXT: subq $128, %rsp
-; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT: movq %rbp, %rsp
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: test_zero_v64i8_align32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
ret void
}
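Note that on the split align-1 and align-16 paths the zeroed value now stays in an XMM register, so the trailing vzeroupper disappears from those sequences as well; the align-32 ZMM cases still use YMM nt-stores and keep it.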