[llvm] r363582 - [X86][AVX] Split under-aligned vector nt-stores.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 10:22:38 PDT 2019


Author: rksimon
Date: Mon Jun 17 10:22:38 2019
New Revision: 363582

URL: http://llvm.org/viewvc/llvm-project?rev=363582&view=rev
Log:
[X86][AVX] Split under-aligned vector nt-stores.

If a YMM/ZMM non-temporal store has less than natural alignment, split the vector in half - either the halves will be satisfactorily aligned, or they will continue to be split until they are XMMs, at which point the legalizer will scalarize them.

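For illustration, here is a minimal standalone C++ sketch of the splitting recursion (a simplified model only, not the actual combineStore/splitVectorStore code - lowerNtStore is a made-up name, and passing the base alignment down unchanged is a simplification that holds for the alignments exercised in the tests):

// Illustration only - a simplified model of the new combine, not the
// actual LLVM code. lowerNtStore() mimics how an under-aligned
// non-temporal store of `Bits` width is handled: YMM/ZMM stores are
// halved until the pieces are naturally aligned or reach XMM width,
// where the legalizer scalarizes them into 64-bit MOVNTI stores.
#include <cstdio>

static void lowerNtStore(unsigned Bits, unsigned AlignBytes,
                         unsigned Offset = 0) {
  unsigned StoreBytes = Bits / 8;
  if (AlignBytes >= StoreBytes) {
    // Naturally aligned - emit a single nt-store of this width.
    std::printf("  %u-bit nt-store at offset %u\n", Bits, Offset);
    return;
  }
  if (Bits > 128) {
    // Under-aligned YMM/ZMM - split into lo/hi halves and recurse.
    lowerNtStore(Bits / 2, AlignBytes, Offset);
    lowerNtStore(Bits / 2, AlignBytes, Offset + StoreBytes / 2);
    return;
  }
  // Under-aligned XMM - the legalizer scalarizes to MOVNTI.
  for (unsigned I = 0; I != StoreBytes; I += 8)
    std::printf("  movnti at offset %u\n", Offset + I);
}

int main() {
  std::printf("v8f32 nt-store, align 16:\n");
  lowerNtStore(256, 16); // two aligned 128-bit nt-stores
  std::printf("v8f32 nt-store, align 1:\n");
  lowerNtStore(256, 1);  // four 64-bit movnti stores
  return 0;
}

Running it for a 256-bit store reproduces the shapes checked in the updated tests below: align 16 gives 2x movntps %xmm, align 1 gives 4x movnti.
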
Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
    llvm/trunk/test/CodeGen/X86/nontemporal-3.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 17 10:22:38 2019
@@ -39545,6 +39545,7 @@ static SDValue combineStore(SDNode *N, S
   EVT VT = St->getValue().getValueType();
   EVT StVT = St->getMemoryVT();
   SDLoc dl(St);
+  unsigned Alignment = St->getAlignment();
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
@@ -39595,8 +39596,6 @@ static SDValue combineStore(SDNode *N, S
                                       StoredVal->ops().slice(32, 32));
       Hi = combinevXi1ConstantToInteger(Hi, DAG);
 
-      unsigned Alignment = St->getAlignment();
-
       SDValue Ptr0 = St->getBasePtr();
       SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
 
@@ -39631,6 +39630,18 @@ static SDValue combineStore(SDNode *N, S
     return splitVectorStore(St, DAG);
   }
 
+  // Split under-aligned vector non-temporal stores.
+  if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+    // ZMM/YMM nt-stores - either it can be stored as a series of shorter
+    // vectors or the legalizer can scalarize it to use MOVNTI.
+    if (VT.is256BitVector() || VT.is512BitVector()) {
+      unsigned NumElems = VT.getVectorNumElements();
+      if (NumElems < 2)
+        return SDValue();
+      return splitVectorStore(St, DAG);
+    }
+  }
+
   // Optimize trunc store (of multiple scalars) to shuffle and store.
   // First, pack all of the elements in one place. Next, store to memory
   // in fewer chunks.

Modified: llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-2.ll?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-2.ll Mon Jun 17 10:22:38 2019
@@ -1230,9 +1230,7 @@ define void @test_op_v32i8(<32 x i8> %a,
 }
 
 ; 256-bit NT stores require 256-bit alignment.
-; FIXME: For AVX, we could lower this to 2x movntps %xmm. Taken further, we
-; could even scalarize to movnti when we have 1-alignment: nontemporal is
-; probably always worth even some 20 instruction scalarization.
+; For AVX, we lower 128-bit alignment as 2x movntps %xmm.
 define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
 ; SSE-LABEL: test_unaligned_v8f32:
 ; SSE:       # %bb.0:
@@ -1245,14 +1243,18 @@ define void @test_unaligned_v8f32(<8 x f
 ; AVX-LABEL: test_unaligned_v8f32:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
+; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
 ; VLX-LABEL: test_unaligned_v8f32:
 ; VLX:       # %bb.0:
 ; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
-; VLX-NEXT:    vmovups %ymm0, (%rdi)
+; VLX-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; VLX-NEXT:    vmovntps %xmm1, 16(%rdi)
+; VLX-NEXT:    vmovntps %xmm0, (%rdi)
 ; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = fadd <8 x float> %a, %b

Modified: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-3.ll?rev=363582&r1=363581&r2=363582&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll Mon Jun 17 10:22:38 2019
@@ -236,15 +236,31 @@ define void @test_zero_v4f64_align1(<4 x
 ; AVX-LABEL: test_zero_v4f64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4f64_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x double> zeroinitializer, <4 x double>* %dst, align 1, !nontemporal !1
   ret void
@@ -269,15 +285,31 @@ define void @test_zero_v8f32_align1(<8 x
 ; AVX-LABEL: test_zero_v8f32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8f32_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <8 x float> zeroinitializer, <8 x float>* %dst, align 1, !nontemporal !1
   ret void
@@ -302,15 +334,31 @@ define void @test_zero_v4i64_align1(<4 x
 ; AVX-LABEL: test_zero_v4i64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4i64_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 1, !nontemporal !1
   ret void
@@ -335,15 +383,31 @@ define void @test_zero_v8i32_align1(<8 x
 ; AVX-LABEL: test_zero_v8i32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8i32_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 1, !nontemporal !1
   ret void
@@ -368,15 +432,31 @@ define void @test_zero_v16i16_align1(<16
 ; AVX-LABEL: test_zero_v16i16_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16i16_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 1, !nontemporal !1
   ret void
@@ -401,15 +481,31 @@ define void @test_zero_v32i8_align1(<32
 ; AVX-LABEL: test_zero_v32i8_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v32i8_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 1, !nontemporal !1
   ret void
@@ -426,15 +522,15 @@ define void @test_zero_v4f64_align16(<4
 ; AVX-LABEL: test_zero_v4f64_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4f64_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x double> zeroinitializer, <4 x double>* %dst, align 16, !nontemporal !1
   ret void
@@ -451,15 +547,15 @@ define void @test_zero_v8f32_align16(<8
 ; AVX-LABEL: test_zero_v8f32_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8f32_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <8 x float> zeroinitializer, <8 x float>* %dst, align 16, !nontemporal !1
   ret void
@@ -476,15 +572,15 @@ define void @test_zero_v4i64_align16(<4
 ; AVX-LABEL: test_zero_v4i64_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4i64_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 16, !nontemporal !1
   ret void
@@ -501,15 +597,15 @@ define void @test_zero_v8i32_align16(<8
 ; AVX-LABEL: test_zero_v8i32_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8i32_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 16, !nontemporal !1
   ret void
@@ -526,15 +622,15 @@ define void @test_zero_v16i16_align16(<1
 ; AVX-LABEL: test_zero_v16i16_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16i16_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 16, !nontemporal !1
   ret void
@@ -551,15 +647,15 @@ define void @test_zero_v32i8_align16(<32
 ; AVX-LABEL: test_zero_v32i8_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v32i8_align16:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
 ; AVX512-NEXT:    retq
   store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 16, !nontemporal !1
   ret void
@@ -574,60 +670,73 @@ define void @test_zero_v8f64_align1(<8 x
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8f64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8f64_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
 ; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movq (%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
 ; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
   ret void
@@ -640,60 +749,73 @@ define void @test_zero_v16f32_align1(<16
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16f32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16f32_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
 ; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movq (%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
 ; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
   ret void
@@ -706,60 +828,73 @@ define void @test_zero_v8i64_align1(<8 x
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8i64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8i64_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
 ; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movq (%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
 ; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
   ret void
@@ -772,60 +907,73 @@ define void @test_zero_v16i32_align1(<16
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16i32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16i32_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
 ; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movq (%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
 ; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
   ret void
@@ -838,69 +986,74 @@ define void @test_zero_v32i16_align1(<32
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_zero_v32i16_align1:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align1:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT:    movq (%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT:    movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT:    movntiq %rax, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v32i16_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
   ret void
 }
@@ -912,69 +1065,74 @@ define void @test_zero_v64i8_align1(<64
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 56(%rdi)
-; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 40(%rdi)
-; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 24(%rdi)
-; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    movntiq %rcx, 8(%rdi)
-; SSE-NEXT:    movntiq %rax, (%rdi)
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_zero_v64i8_align1:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align1:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512BW-NEXT:    movq (%rsp), %rax
-; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512BW-NEXT:    movntiq %rcx, 8(%rdi)
-; AVX512BW-NEXT:    movntiq %rax, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v64i8_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
   ret void
 }
@@ -983,39 +1141,28 @@ define void @test_zero_v8f64_align16(<8
 ; SSE-LABEL: test_zero_v8f64_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8f64_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8f64_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
 ; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
   ret void
@@ -1025,39 +1172,28 @@ define void @test_zero_v16f32_align16(<1
 ; SSE-LABEL: test_zero_v16f32_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16f32_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16f32_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
 ; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
   ret void
@@ -1067,39 +1203,28 @@ define void @test_zero_v8i64_align16(<8
 ; SSE-LABEL: test_zero_v8i64_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8i64_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8i64_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
 ; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
   ret void
@@ -1109,39 +1234,28 @@ define void @test_zero_v16i32_align16(<1
 ; SSE-LABEL: test_zero_v16i32_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16i32_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16i32_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
 ; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
   ret void
@@ -1151,48 +1265,29 @@ define void @test_zero_v32i16_align16(<3
 ; SSE-LABEL: test_zero_v32i16_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
-; AVX512DQ-LABEL: test_zero_v32i16_align16:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: test_zero_v32i16_align16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
   ret void
 }
@@ -1201,48 +1296,29 @@ define void @test_zero_v64i8_align16(<64
 ; SSE-LABEL: test_zero_v64i8_align16:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movntps %xmm0, 48(%rdi)
-; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    movntps %xmm0, 16(%rdi)
 ; SSE-NEXT:    movntps %xmm0, (%rdi)
+; SSE-NEXT:    movntps %xmm0, 48(%rdi)
+; SSE-NEXT:    movntps %xmm0, 32(%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align16:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX-NEXT:    vmovups %ymm0, (%rdi)
-; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX-NEXT:    vmovntps %xmm0, 32(%rdi)
 ; AVX-NEXT:    retq
 ;
-; AVX512DQ-LABEL: test_zero_v64i8_align16:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovups %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovups %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    vmovaps (%rsp), %xmm0
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
-; AVX512BW-NEXT:    vmovntps %xmm3, 48(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm2, 32(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm1, 16(%rdi)
-; AVX512BW-NEXT:    vmovntps %xmm0, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: test_zero_v64i8_align16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %xmm0, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, 32(%rdi)
+; AVX512-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
   ret void
 }
@@ -1267,18 +1343,9 @@ define void @test_zero_v8f64_align32(<8
 ;
 ; AVX512-LABEL: test_zero_v8f64_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
 ; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
@@ -1305,18 +1372,9 @@ define void @test_zero_v16f32_align32(<1
 ;
 ; AVX512-LABEL: test_zero_v16f32_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
 ; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
@@ -1343,18 +1401,9 @@ define void @test_zero_v8i64_align32(<8
 ;
 ; AVX512-LABEL: test_zero_v8i64_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
 ; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
@@ -1381,18 +1430,9 @@ define void @test_zero_v16i32_align32(<1
 ;
 ; AVX512-LABEL: test_zero_v16i32_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    movq %rsp, %rbp
-; AVX512-NEXT:    andq $-64, %rsp
-; AVX512-NEXT:    subq $128, %rsp
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
 ; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512-NEXT:    movq %rbp, %rsp
-; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
@@ -1417,30 +1457,13 @@ define void @test_zero_v32i16_align32(<3
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512DQ-LABEL: test_zero_v32i16_align32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v32i16_align32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT:    vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: test_zero_v32i16_align32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
   ret void
 }
@@ -1463,30 +1486,13 @@ define void @test_zero_v64i8_align32(<64
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512DQ-LABEL: test_zero_v64i8_align32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT:    vmovntps %ymm0, 32(%rdi)
-; AVX512DQ-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: test_zero_v64i8_align32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    movq %rsp, %rbp
-; AVX512BW-NEXT:    andq $-64, %rsp
-; AVX512BW-NEXT:    subq $128, %rsp
-; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
-; AVX512BW-NEXT:    vmovaps (%rsp), %ymm0
-; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
-; AVX512BW-NEXT:    vmovntps %ymm1, 32(%rdi)
-; AVX512BW-NEXT:    vmovntps %ymm0, (%rdi)
-; AVX512BW-NEXT:    movq %rbp, %rsp
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: test_zero_v64i8_align32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovntps %ymm0, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
   ret void
 }



