[llvm] r363564 - [X86][SSE] Prevent misaligned non-temporal vector load/store combines

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 17 07:26:10 PDT 2019


Author: rksimon
Date: Mon Jun 17 07:26:10 2019
New Revision: 363564

URL: http://llvm.org/viewvc/llvm-project?rev=363564&view=rev
Log:
[X86][SSE] Prevent misaligned non-temporal vector load/store combines

For loads, pre-SSE41 we can't perform NT loads at all, and after that we can only perform vector-aligned NT loads, so if the alignment is less than that of an xmm register (16 bytes) we'll just end up using regular unaligned vector loads anyway.
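
For example (illustrative IR only, function name hypothetical, not part of the
committed tests; the !nontemporal marker uses the standard !{i32 1} metadata),
an NT vector load with only 4-byte alignment is now simply treated as a
regular unaligned load:

  define <4 x float> @nt_load_align4(<4 x float>* %p) {
    %v = load <4 x float>, <4 x float>* %p, align 4, !nontemporal !0
    ret <4 x float> %v
  }
  !0 = !{i32 1}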

This is a first step towards fixing PR42026; the next step for stores will be to use SSE4A MOVNTSD where possible and to avoid the stack spill on SSE2 targets.
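
Similarly for stores (again illustrative, mirroring the
merge-consecutive-stores-nt.ll changes below), an underaligned NT vector
store is no longer merged into wider unaligned regular stores that would
drop the non-temporal hint; instead it is scalarized through an aligned
stack spill and MOVNTI stores:

  define void @nt_store_align1(<4 x float>* %p, <4 x float> %v) {
    store <4 x float> %v, <4 x float>* %p, align 1, !nontemporal !0
    ret void
  }
  !0 = !{i32 1}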

Differential Revision: https://reviews.llvm.org/D63246

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
    llvm/trunk/test/CodeGen/X86/nontemporal-3.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 17 07:26:10 2019
@@ -2106,10 +2106,9 @@ bool X86TargetLowering::isSafeMemOpType(
   return true;
 }
 
-bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
-                                                       unsigned,
-                                                       MachineMemOperand::Flags,
-                                                       bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *Fast) const {
   if (Fast) {
     switch (VT.getSizeInBits()) {
     default:
@@ -2125,6 +2124,16 @@ bool X86TargetLowering::allowsMisaligned
     // TODO: What about AVX-512 (512-bit) accesses?
     }
   }
+  // NonTemporal vector memory ops must be aligned.
+  if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+    // NT loads can only be vector aligned, so if it's less aligned than the
+    // minimum vector size (which we can split the vector down to), we might as
+    // well use a regular unaligned vector load.
+    // We don't have any NT loads pre-SSE41.
+    if (!!(Flags & MachineMemOperand::MOLoad))
+      return (Align < 16 || !Subtarget.hasSSE41());
+    return false;
+  }
   // Misaligned accesses of any size are always allowed.
   return true;
 }

Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll Mon Jun 17 07:26:10 2019
@@ -163,7 +163,7 @@ define void @merge_2_v4f32_align32_mix_n
   ret void
 }
 
-; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
+; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
 ; Must be kept separate as VMOVNTDQA xmm.
 define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align16_ntload:
@@ -200,20 +200,13 @@ define void @merge_2_v4f32_align16_ntloa
 ; X64-SSE41-NEXT:    movdqa %xmm1, 16(%rsi)
 ; X64-SSE41-NEXT:    retq
 ;
-; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload:
-; X64-AVX1:       # %bb.0:
-; X64-AVX1-NEXT:    vmovntdqa (%rdi), %xmm0
-; X64-AVX1-NEXT:    vmovntdqa 16(%rdi), %xmm1
-; X64-AVX1-NEXT:    vmovdqa %xmm1, 16(%rsi)
-; X64-AVX1-NEXT:    vmovdqa %xmm0, (%rsi)
-; X64-AVX1-NEXT:    retq
-;
-; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload:
-; X64-AVX2:       # %bb.0:
-; X64-AVX2-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX2-NEXT:    vmovups %ymm0, (%rsi)
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
+; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovntdqa 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovdqa %xmm0, (%rsi)
+; X64-AVX-NEXT:    vmovdqa %xmm1, 16(%rsi)
+; X64-AVX-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*
   %3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
@@ -225,7 +218,7 @@ define void @merge_2_v4f32_align16_ntloa
   ret void
 }
 
-; FIXME: AVX can't perform NT-store-ymm on 16-byte aligned memory.
+; AVX can't perform NT-store-ymm on 16-byte aligned memory.
 ; Must be kept separate as VMOVNTPS xmm.
 define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align16_ntstore:
@@ -248,9 +241,10 @@ define void @merge_2_v4f32_align16_ntsto
 ;
 ; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovaps 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovntps %xmm0, (%rsi)
+; X64-AVX-NEXT:    vmovntps %xmm1, 16(%rsi)
 ; X64-AVX-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*
@@ -263,7 +257,7 @@ define void @merge_2_v4f32_align16_ntsto
   ret void
 }
 
-; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Nothing can perform NT-load-vector on 1-byte aligned memory.
 ; Just perform regular loads.
 define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align1_ntload:
@@ -301,32 +295,71 @@ define void @merge_2_v4f32_align1_ntload
   ret void
 }
 
-; FIXME: Nothing can perform NT-store-vector on 1-byte aligned memory.
+; Nothing can perform NT-store-vector on 1-byte aligned memory.
 ; Must be scalarized to use MOVNTI/MOVNTSD.
 define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align1_ntstore:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
 ; X86-NEXT:    movups (%ecx), %xmm0
 ; X86-NEXT:    movups 16(%ecx), %xmm1
-; X86-NEXT:    movups %xmm0, (%eax)
-; X86-NEXT:    movups %xmm1, 16(%eax)
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movntil %edx, 4(%eax)
+; X86-NEXT:    movntil %ecx, (%eax)
+; X86-NEXT:    movaps %xmm1, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 28(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 24(%eax)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movntil %edx, 20(%eax)
+; X86-NEXT:    movntil %ecx, 16(%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE-NEXT:    movups %xmm0, (%rsi)
-; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT:    movntiq %rcx, 8(%rsi)
+; X64-SSE-NEXT:    movntiq %rax, (%rsi)
+; X64-SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT:    movntiq %rcx, 24(%rsi)
+; X64-SSE-NEXT:    movntiq %rax, 16(%rsi)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT:    movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT:    movntiq %rax, (%rsi)
+; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT:    movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
 ; X64-AVX-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*
@@ -339,32 +372,71 @@ define void @merge_2_v4f32_align1_ntstor
   ret void
 }
 
-; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Nothing can perform NT-load-vector on 1-byte aligned memory.
 ; Just perform regular loads and scalarize NT-stores.
 define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-LABEL: merge_2_v4f32_align1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $48, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
 ; X86-NEXT:    movups (%ecx), %xmm0
 ; X86-NEXT:    movups 16(%ecx), %xmm1
-; X86-NEXT:    movups %xmm0, (%eax)
-; X86-NEXT:    movups %xmm1, 16(%eax)
+; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 12(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 8(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movntil %edx, 4(%eax)
+; X86-NEXT:    movntil %ecx, (%eax)
+; X86-NEXT:    movaps %xmm1, (%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 28(%eax)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movntil %ecx, 24(%eax)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movntil %edx, 20(%eax)
+; X86-NEXT:    movntil %ecx, 16(%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-SSE-LABEL: merge_2_v4f32_align1:
 ; X64-SSE:       # %bb.0:
 ; X64-SSE-NEXT:    movups (%rdi), %xmm0
 ; X64-SSE-NEXT:    movups 16(%rdi), %xmm1
-; X64-SSE-NEXT:    movups %xmm0, (%rsi)
-; X64-SSE-NEXT:    movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT:    movntiq %rcx, 8(%rsi)
+; X64-SSE-NEXT:    movntiq %rax, (%rsi)
+; X64-SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT:    movntiq %rcx, 24(%rsi)
+; X64-SSE-NEXT:    movntiq %rax, 16(%rsi)
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: merge_2_v4f32_align1:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    vmovups (%rdi), %xmm0
+; X64-AVX-NEXT:    vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT:    movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT:    movntiq %rax, (%rsi)
+; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT:    movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT:    movntiq %rax, 16(%rsi)
 ; X64-AVX-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
   %2 = bitcast float* %1 to <4 x float>*

Modified: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-3.ll?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll Mon Jun 17 07:26:10 2019
@@ -15,19 +15,31 @@ define void @test_zero_v2f64_align1(<2 x
 ; SSE-LABEL: test_zero_v2f64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v2f64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v2f64_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1
   ret void
@@ -37,19 +49,31 @@ define void @test_zero_v4f32_align1(<4 x
 ; SSE-LABEL: test_zero_v4f32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v4f32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4f32_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
   ret void
@@ -59,19 +83,31 @@ define void @test_zero_v2i64_align1(<2 x
 ; SSE-LABEL: test_zero_v2i64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v2i64_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v2i64_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1
   ret void
@@ -81,19 +117,31 @@ define void @test_zero_v4i32_align1(<4 x
 ; SSE-LABEL: test_zero_v4i32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v4i32_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v4i32_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
   ret void
@@ -103,19 +151,31 @@ define void @test_zero_v8i16_align1(<8 x
 ; SSE-LABEL: test_zero_v8i16_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8i16_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v8i16_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1
   ret void
@@ -125,19 +185,31 @@ define void @test_zero_v16i8_align1(<16
 ; SSE-LABEL: test_zero_v16i8_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16i8_align1:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovups %xmm0, (%rdi)
+; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
 ; AVX-NEXT:    retq
 ;
 ; AVX512-LABEL: test_zero_v16i8_align1:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %xmm0, (%rdi)
+; AVX512-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
 ; AVX512-NEXT:    retq
   store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1
   ret void
@@ -149,8 +221,16 @@ define void @test_zero_v4f64_align1(<4 x
 ; SSE-LABEL: test_zero_v4f64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v4f64_align1:
@@ -174,8 +254,16 @@ define void @test_zero_v8f32_align1(<8 x
 ; SSE-LABEL: test_zero_v8f32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8f32_align1:
@@ -199,8 +287,16 @@ define void @test_zero_v4i64_align1(<4 x
 ; SSE-LABEL: test_zero_v4i64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v4i64_align1:
@@ -224,8 +320,16 @@ define void @test_zero_v8i32_align1(<8 x
 ; SSE-LABEL: test_zero_v8i32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8i32_align1:
@@ -249,8 +353,16 @@ define void @test_zero_v16i16_align1(<16
 ; SSE-LABEL: test_zero_v16i16_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16i16_align1:
@@ -274,8 +386,16 @@ define void @test_zero_v32i8_align1(<32
 ; SSE-LABEL: test_zero_v32i8_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v32i8_align1:
@@ -451,10 +571,26 @@ define void @test_zero_v8f64_align1(<8 x
 ; SSE-LABEL: test_zero_v8f64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8f64_align1:
@@ -467,8 +603,30 @@ define void @test_zero_v8f64_align1(<8 x
 ;
 ; AVX512-LABEL: test_zero_v8f64_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movq (%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
@@ -479,10 +637,26 @@ define void @test_zero_v16f32_align1(<16
 ; SSE-LABEL: test_zero_v16f32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16f32_align1:
@@ -495,8 +669,30 @@ define void @test_zero_v16f32_align1(<16
 ;
 ; AVX512-LABEL: test_zero_v16f32_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movq (%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
@@ -507,10 +703,26 @@ define void @test_zero_v8i64_align1(<8 x
 ; SSE-LABEL: test_zero_v8i64_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v8i64_align1:
@@ -523,8 +735,30 @@ define void @test_zero_v8i64_align1(<8 x
 ;
 ; AVX512-LABEL: test_zero_v8i64_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movq (%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
@@ -535,10 +769,26 @@ define void @test_zero_v16i32_align1(<16
 ; SSE-LABEL: test_zero_v16i32_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v16i32_align1:
@@ -551,8 +801,30 @@ define void @test_zero_v16i32_align1(<16
 ;
 ; AVX512-LABEL: test_zero_v16i32_align1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movq (%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
@@ -563,10 +835,26 @@ define void @test_zero_v32i16_align1(<32
 ; SSE-LABEL: test_zero_v32i16_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
@@ -587,8 +875,30 @@ define void @test_zero_v32i16_align1(<32
 ;
 ; AVX512BW-LABEL: test_zero_v32i16_align1:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512BW-NEXT:    movq (%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512BW-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512BW-NEXT:    movntiq %rax, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
@@ -599,10 +909,26 @@ define void @test_zero_v64i8_align1(<64
 ; SSE-LABEL: test_zero_v64i8_align1:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movups %xmm0, 48(%rdi)
-; SSE-NEXT:    movups %xmm0, 32(%rdi)
-; SSE-NEXT:    movups %xmm0, 16(%rdi)
-; SSE-NEXT:    movups %xmm0, (%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 56(%rdi)
+; SSE-NEXT:    movntiq %rax, 48(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 40(%rdi)
+; SSE-NEXT:    movntiq %rax, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 24(%rdi)
+; SSE-NEXT:    movntiq %rax, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movntiq %rcx, 8(%rdi)
+; SSE-NEXT:    movntiq %rax, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
@@ -623,8 +949,30 @@ define void @test_zero_v64i8_align1(<64
 ;
 ; AVX512BW-LABEL: test_zero_v64i8_align1:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512BW-NEXT:    movq (%rsp), %rax
+; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512BW-NEXT:    movntiq %rcx, 8(%rdi)
+; AVX512BW-NEXT:    movntiq %rax, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
@@ -651,8 +999,22 @@ define void @test_zero_v8f64_align16(<8
 ;
 ; AVX512-LABEL: test_zero_v8f64_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
@@ -679,8 +1041,22 @@ define void @test_zero_v16f32_align16(<1
 ;
 ; AVX512-LABEL: test_zero_v16f32_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
@@ -707,8 +1083,22 @@ define void @test_zero_v8i64_align16(<8
 ;
 ; AVX512-LABEL: test_zero_v8i64_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
@@ -735,8 +1125,22 @@ define void @test_zero_v16i32_align16(<1
 ;
 ; AVX512-LABEL: test_zero_v16i32_align16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
@@ -771,8 +1175,22 @@ define void @test_zero_v32i16_align16(<3
 ;
 ; AVX512BW-LABEL: test_zero_v32i16_align16:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512BW-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
@@ -807,8 +1225,22 @@ define void @test_zero_v64i8_align16(<64
 ;
 ; AVX512BW-LABEL: test_zero_v64i8_align16:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    vmovaps (%rsp), %xmm0
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512BW-NEXT:    vmovntps %xmm3, 48(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm2, 32(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm1, 16(%rdi)
+; AVX512BW-NEXT:    vmovntps %xmm0, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
@@ -835,8 +1267,18 @@ define void @test_zero_v8f64_align32(<8
 ;
 ; AVX512-LABEL: test_zero_v8f64_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
@@ -863,8 +1305,18 @@ define void @test_zero_v16f32_align32(<1
 ;
 ; AVX512-LABEL: test_zero_v16f32_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
@@ -891,8 +1343,18 @@ define void @test_zero_v8i64_align32(<8
 ;
 ; AVX512-LABEL: test_zero_v8i64_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
@@ -919,8 +1381,18 @@ define void @test_zero_v16i32_align32(<1
 ;
 ; AVX512-LABEL: test_zero_v16i32_align32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    movq %rsp, %rbp
+; AVX512-NEXT:    andq $-64, %rsp
+; AVX512-NEXT:    subq $128, %rsp
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    movq %rbp, %rsp
+; AVX512-NEXT:    popq %rbp
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
@@ -955,8 +1427,18 @@ define void @test_zero_v32i16_align32(<3
 ;
 ; AVX512BW-LABEL: test_zero_v32i16_align32:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512BW-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512BW-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
@@ -991,8 +1473,18 @@ define void @test_zero_v64i8_align32(<64
 ;
 ; AVX512BW-LABEL: test_zero_v64i8_align32:
 ; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $128, %rsp
 ; AVX512BW-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT:    vmovaps (%rsp), %ymm0
+; AVX512BW-NEXT:    vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512BW-NEXT:    vmovntps %ymm1, 32(%rdi)
+; AVX512BW-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1
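
The updated CHECK lines above can be reproduced with an llc invocation
along these lines (flags illustrative; the tests' actual RUN lines are
not shown in this diff):

  llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < llvm/trunk/test/CodeGen/X86/nontemporal-3.ll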
