[llvm] r363564 - [X86][SSE] Prevent misaligned non-temporal vector load/store combines
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 17 07:26:10 PDT 2019
Author: rksimon
Date: Mon Jun 17 07:26:10 2019
New Revision: 363564
URL: http://llvm.org/viewvc/llvm-project?rev=363564&view=rev
Log:
[X86][SSE] Prevent misaligned non-temporal vector load/store combines
For loads, we can't perform NT loads at all before SSE41, and even with SSE41 we can only perform vector-aligned NT loads, so if the alignment is less than that of an xmm register we'll just end up using regular unaligned vector loads anyway.
This is the first step towards fixing PR42026; the next step for stores will be to use SSE4A movntsd where possible and to avoid the stack spill on SSE2 targets.
Differential Revision: https://reviews.llvm.org/D63246
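For reference, the affected IR looks like the cases exercised in the tests below; a minimal sketch (the function name and metadata id are illustrative):

define void @nt_store_align1(<4 x float>* %dst) {
  ; 1-byte aligned non-temporal vector store: no NT vector store instruction
  ; accepts this alignment, so it is scalarized to MOVNTI stores instead of
  ; being merged into a wider misaligned vector store.
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
  ret void
}
!1 = !{i32 1}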
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jun 17 07:26:10 2019
@@ -2106,10 +2106,9 @@ bool X86TargetLowering::isSafeMemOpType(
return true;
}
-bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
- unsigned,
- MachineMemOperand::Flags,
- bool *Fast) const {
+bool X86TargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *Fast) const {
if (Fast) {
switch (VT.getSizeInBits()) {
default:
@@ -2125,6 +2124,16 @@ bool X86TargetLowering::allowsMisaligned
// TODO: What about AVX-512 (512-bit) accesses?
}
}
+ // NonTemporal vector memory ops must be aligned.
+ if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
+ // NT loads can only be vector aligned, so if it's less aligned than the
+ // minimum vector size (which we can split the vector down to), we might as
+ // well use a regular unaligned vector load.
+ // We don't have any NT loads pre-SSE41.
+ if (!!(Flags & MachineMemOperand::MOLoad))
+ return (Align < 16 || !Subtarget.hasSSE41());
+ return false;
+ }
// Misaligned accesses of any size are always allowed.
return true;
}
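To illustrate the load half of the new check, here is a minimal IR sketch (identifiers are illustrative, following the pattern of the tests below); with alignment below 16 bytes, or without SSE41, this is now treated as a regular misaligned access and lowered to plain unaligned loads:

define <4 x float> @nt_load_align1(<4 x float>* %src) {
  ; 1-byte aligned non-temporal vector load: MOVNTDQA requires vector
  ; alignment (and SSE41), so this falls back to an unaligned load (movups).
  %v = load <4 x float>, <4 x float>* %src, align 1, !nontemporal !1
  ret <4 x float> %v
}
!1 = !{i32 1}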
Modified: llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-stores-nt.ll Mon Jun 17 07:26:10 2019
@@ -163,7 +163,7 @@ define void @merge_2_v4f32_align32_mix_n
ret void
}
-; FIXME: AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
+; AVX2 can't perform NT-load-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTDQA xmm.
define void @merge_2_v4f32_align16_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntload:
@@ -200,20 +200,13 @@ define void @merge_2_v4f32_align16_ntloa
; X64-SSE41-NEXT: movdqa %xmm1, 16(%rsi)
; X64-SSE41-NEXT: retq
;
-; X64-AVX1-LABEL: merge_2_v4f32_align16_ntload:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovntdqa (%rdi), %xmm0
-; X64-AVX1-NEXT: vmovntdqa 16(%rdi), %xmm1
-; X64-AVX1-NEXT: vmovdqa %xmm1, 16(%rsi)
-; X64-AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: merge_2_v4f32_align16_ntload:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovups %ymm0, (%rsi)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
+; X64-AVX-LABEL: merge_2_v4f32_align16_ntload:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: vmovntdqa (%rdi), %xmm0
+; X64-AVX-NEXT: vmovntdqa 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovdqa %xmm0, (%rsi)
+; X64-AVX-NEXT: vmovdqa %xmm1, 16(%rsi)
+; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
%3 = load <4 x float>, <4 x float>* %a0, align 16, !nontemporal !0
@@ -225,7 +218,7 @@ define void @merge_2_v4f32_align16_ntloa
ret void
}
-; FIXME: AVX can't perform NT-store-ymm on 16-byte aligned memory.
+; AVX can't perform NT-store-ymm on 16-byte aligned memory.
; Must be kept separate as VMOVNTPS xmm.
define void @merge_2_v4f32_align16_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align16_ntstore:
@@ -248,9 +241,10 @@ define void @merge_2_v4f32_align16_ntsto
;
; X64-AVX-LABEL: merge_2_v4f32_align16_ntstore:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovntps %xmm0, (%rsi)
+; X64-AVX-NEXT: vmovntps %xmm1, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
@@ -263,7 +257,7 @@ define void @merge_2_v4f32_align16_ntsto
ret void
}
-; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads.
define void @merge_2_v4f32_align1_ntload(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntload:
@@ -301,32 +295,71 @@ define void @merge_2_v4f32_align1_ntload
ret void
}
-; FIXME: Nothing can perform NT-store-vector on 1-byte aligned memory.
+; Nothing can perform NT-store-vector on 1-byte aligned memory.
; Must be scalarized to use MOVNTI/MOVNTSD.
define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1_ntstore:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $48, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movups (%ecx), %xmm0
; X86-NEXT: movups 16(%ecx), %xmm1
-; X86-NEXT: movups %xmm0, (%eax)
-; X86-NEXT: movups %xmm1, 16(%eax)
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 12(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 8(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movntil %edx, 4(%eax)
+; X86-NEXT: movntil %ecx, (%eax)
+; X86-NEXT: movaps %xmm1, (%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 28(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 24(%eax)
+; X86-NEXT: movl (%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movntil %edx, 20(%eax)
+; X86-NEXT: movntil %ecx, 16(%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1_ntstore:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE-NEXT: movups %xmm0, (%rsi)
-; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movntiq %rcx, 8(%rsi)
+; X64-SSE-NEXT: movntiq %rax, (%rsi)
+; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movntiq %rcx, 24(%rsi)
+; X64-SSE-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1_ntstore:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT: movntiq %rax, (%rsi)
+; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
@@ -339,32 +372,71 @@ define void @merge_2_v4f32_align1_ntstor
ret void
}
-; FIXME: Nothing can perform NT-load-vector on 1-byte aligned memory.
+; Nothing can perform NT-load-vector on 1-byte aligned memory.
; Just perform regular loads and scalarize NT-stores.
define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
; X86-LABEL: merge_2_v4f32_align1:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $48, %esp
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: movups (%ecx), %xmm0
; X86-NEXT: movups 16(%ecx), %xmm1
-; X86-NEXT: movups %xmm0, (%eax)
-; X86-NEXT: movups %xmm1, 16(%eax)
+; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 12(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 8(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movntil %edx, 4(%eax)
+; X86-NEXT: movntil %ecx, (%eax)
+; X86-NEXT: movaps %xmm1, (%esp)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 28(%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movntil %ecx, 24(%eax)
+; X86-NEXT: movl (%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movntil %edx, 20(%eax)
+; X86-NEXT: movntil %ecx, 16(%eax)
+; X86-NEXT: movl %ebp, %esp
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-SSE-LABEL: merge_2_v4f32_align1:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movups (%rdi), %xmm0
; X64-SSE-NEXT: movups 16(%rdi), %xmm1
-; X64-SSE-NEXT: movups %xmm0, (%rsi)
-; X64-SSE-NEXT: movups %xmm1, 16(%rsi)
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movntiq %rcx, 8(%rsi)
+; X64-SSE-NEXT: movntiq %rax, (%rsi)
+; X64-SSE-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movntiq %rcx, 24(%rsi)
+; X64-SSE-NEXT: movntiq %rax, 16(%rsi)
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: merge_2_v4f32_align1:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups %ymm0, (%rsi)
-; X64-AVX-NEXT: vzeroupper
+; X64-AVX-NEXT: vmovups (%rdi), %xmm0
+; X64-AVX-NEXT: vmovups 16(%rdi), %xmm1
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movntiq %rcx, 8(%rsi)
+; X64-AVX-NEXT: movntiq %rax, (%rsi)
+; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movntiq %rcx, 24(%rsi)
+; X64-AVX-NEXT: movntiq %rax, 16(%rsi)
; X64-AVX-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %a0, i64 1, i64 0
%2 = bitcast float* %1 to <4 x float>*
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-3.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-3.ll?rev=363564&r1=363563&r2=363564&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-3.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-3.ll Mon Jun 17 07:26:10 2019
@@ -15,19 +15,31 @@ define void @test_zero_v2f64_align1(<2 x
; SSE-LABEL: test_zero_v2f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2f64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v2f64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <2 x double> zeroinitializer, <2 x double>* %dst, align 1, !nontemporal !1
ret void
@@ -37,19 +49,31 @@ define void @test_zero_v4f32_align1(<4 x
; SSE-LABEL: test_zero_v4f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4f32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x float> zeroinitializer, <4 x float>* %dst, align 1, !nontemporal !1
ret void
@@ -59,19 +83,31 @@ define void @test_zero_v2i64_align1(<2 x
; SSE-LABEL: test_zero_v2i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v2i64_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v2i64_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 1, !nontemporal !1
ret void
@@ -81,19 +117,31 @@ define void @test_zero_v4i32_align1(<4 x
; SSE-LABEL: test_zero_v4i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i32_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v4i32_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 1, !nontemporal !1
ret void
@@ -103,19 +151,31 @@ define void @test_zero_v8i16_align1(<8 x
; SSE-LABEL: test_zero_v8i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i16_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v8i16_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 1, !nontemporal !1
ret void
@@ -125,19 +185,31 @@ define void @test_zero_v16i8_align1(<16
; SSE-LABEL: test_zero_v16i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i8_align1:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovups %xmm0, (%rdi)
+; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movntiq %rcx, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
; AVX-NEXT: retq
;
; AVX512-LABEL: test_zero_v16i8_align1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %xmm0, (%rdi)
+; AVX512-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
; AVX512-NEXT: retq
store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 1, !nontemporal !1
ret void
@@ -149,8 +221,16 @@ define void @test_zero_v4f64_align1(<4 x
; SSE-LABEL: test_zero_v4f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4f64_align1:
@@ -174,8 +254,16 @@ define void @test_zero_v8f32_align1(<8 x
; SSE-LABEL: test_zero_v8f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f32_align1:
@@ -199,8 +287,16 @@ define void @test_zero_v4i64_align1(<4 x
; SSE-LABEL: test_zero_v4i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v4i64_align1:
@@ -224,8 +320,16 @@ define void @test_zero_v8i32_align1(<8 x
; SSE-LABEL: test_zero_v8i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i32_align1:
@@ -249,8 +353,16 @@ define void @test_zero_v16i16_align1(<16
; SSE-LABEL: test_zero_v16i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i16_align1:
@@ -274,8 +386,16 @@ define void @test_zero_v32i8_align1(<32
; SSE-LABEL: test_zero_v32i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i8_align1:
@@ -451,10 +571,26 @@ define void @test_zero_v8f64_align1(<8 x
; SSE-LABEL: test_zero_v8f64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8f64_align1:
@@ -467,8 +603,30 @@ define void @test_zero_v8f64_align1(<8 x
;
; AVX512-LABEL: test_zero_v8f64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movq (%rsp), %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 1, !nontemporal !1
@@ -479,10 +637,26 @@ define void @test_zero_v16f32_align1(<16
; SSE-LABEL: test_zero_v16f32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16f32_align1:
@@ -495,8 +669,30 @@ define void @test_zero_v16f32_align1(<16
;
; AVX512-LABEL: test_zero_v16f32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movq (%rsp), %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 1, !nontemporal !1
@@ -507,10 +703,26 @@ define void @test_zero_v8i64_align1(<8 x
; SSE-LABEL: test_zero_v8i64_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v8i64_align1:
@@ -523,8 +735,30 @@ define void @test_zero_v8i64_align1(<8 x
;
; AVX512-LABEL: test_zero_v8i64_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movq (%rsp), %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 1, !nontemporal !1
@@ -535,10 +769,26 @@ define void @test_zero_v16i32_align1(<16
; SSE-LABEL: test_zero_v16i32_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v16i32_align1:
@@ -551,8 +801,30 @@ define void @test_zero_v16i32_align1(<16
;
; AVX512-LABEL: test_zero_v16i32_align1:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movq (%rsp), %rax
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 1, !nontemporal !1
@@ -563,10 +835,26 @@ define void @test_zero_v32i16_align1(<32
; SSE-LABEL: test_zero_v32i16_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v32i16_align1:
@@ -587,8 +875,30 @@ define void @test_zero_v32i16_align1(<32
;
; AVX512BW-LABEL: test_zero_v32i16_align1:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
+; AVX512BW-NEXT: movq (%rsp), %rax
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512BW-NEXT: movntiq %rax, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 1, !nontemporal !1
@@ -599,10 +909,26 @@ define void @test_zero_v64i8_align1(<64
; SSE-LABEL: test_zero_v64i8_align1:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, 48(%rdi)
-; SSE-NEXT: movups %xmm0, 32(%rdi)
-; SSE-NEXT: movups %xmm0, 16(%rdi)
-; SSE-NEXT: movups %xmm0, (%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movntiq %rcx, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: test_zero_v64i8_align1:
@@ -623,8 +949,30 @@ define void @test_zero_v64i8_align1(<64
;
; AVX512BW-LABEL: test_zero_v64i8_align1:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 56(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 48(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 40(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 32(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 24(%rdi)
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512BW-NEXT: movntiq %rax, 16(%rdi)
+; AVX512BW-NEXT: movq (%rsp), %rax
+; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512BW-NEXT: movntiq %rcx, 8(%rdi)
+; AVX512BW-NEXT: movntiq %rax, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 1, !nontemporal !1
@@ -651,8 +999,22 @@ define void @test_zero_v8f64_align16(<8
;
; AVX512-LABEL: test_zero_v8f64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %xmm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 16, !nontemporal !1
@@ -679,8 +1041,22 @@ define void @test_zero_v16f32_align16(<1
;
; AVX512-LABEL: test_zero_v16f32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %xmm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 16, !nontemporal !1
@@ -707,8 +1083,22 @@ define void @test_zero_v8i64_align16(<8
;
; AVX512-LABEL: test_zero_v8i64_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %xmm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 16, !nontemporal !1
@@ -735,8 +1125,22 @@ define void @test_zero_v16i32_align16(<1
;
; AVX512-LABEL: test_zero_v16i32_align16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %xmm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 16, !nontemporal !1
@@ -771,8 +1175,22 @@ define void @test_zero_v32i16_align16(<3
;
; AVX512BW-LABEL: test_zero_v32i16_align16:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 16, !nontemporal !1
@@ -807,8 +1225,22 @@ define void @test_zero_v64i8_align16(<64
;
; AVX512BW-LABEL: test_zero_v64i8_align16:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: vmovaps (%rsp), %xmm0
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm2
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm3
+; AVX512BW-NEXT: vmovntps %xmm3, 48(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm2, 32(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm1, 16(%rdi)
+; AVX512BW-NEXT: vmovntps %xmm0, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 16, !nontemporal !1
@@ -835,8 +1267,18 @@ define void @test_zero_v8f64_align32(<8
;
; AVX512-LABEL: test_zero_v8f64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %ymm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x double> zeroinitializer, <8 x double>* %dst, align 32, !nontemporal !1
@@ -863,8 +1305,18 @@ define void @test_zero_v16f32_align32(<1
;
; AVX512-LABEL: test_zero_v16f32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %ymm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x float> zeroinitializer, <16 x float>* %dst, align 32, !nontemporal !1
@@ -891,8 +1343,18 @@ define void @test_zero_v8i64_align32(<8
;
; AVX512-LABEL: test_zero_v8i64_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %ymm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, <8 x i64>* %dst, align 32, !nontemporal !1
@@ -919,8 +1381,18 @@ define void @test_zero_v16i32_align32(<1
;
; AVX512-LABEL: test_zero_v16i32_align32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovups %zmm0, (%rdi)
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: movq %rsp, %rbp
+; AVX512-NEXT: andq $-64, %rsp
+; AVX512-NEXT: subq $128, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512-NEXT: vmovaps (%rsp), %ymm0
+; AVX512-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512-NEXT: movq %rbp, %rsp
+; AVX512-NEXT: popq %rbp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, <16 x i32>* %dst, align 32, !nontemporal !1
@@ -955,8 +1427,18 @@ define void @test_zero_v32i16_align32(<3
;
; AVX512BW-LABEL: test_zero_v32i16_align32:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <32 x i16> zeroinitializer, <32 x i16>* %dst, align 32, !nontemporal !1
@@ -991,8 +1473,18 @@ define void @test_zero_v64i8_align32(<64
;
; AVX512BW-LABEL: test_zero_v64i8_align32:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: pushq %rbp
+; AVX512BW-NEXT: movq %rsp, %rbp
+; AVX512BW-NEXT: andq $-64, %rsp
+; AVX512BW-NEXT: subq $128, %rsp
; AVX512BW-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovups %zmm0, (%rdi)
+; AVX512BW-NEXT: vmovaps %zmm0, (%rsp)
+; AVX512BW-NEXT: vmovaps (%rsp), %ymm0
+; AVX512BW-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
+; AVX512BW-NEXT: vmovntps %ymm1, 32(%rdi)
+; AVX512BW-NEXT: vmovntps %ymm0, (%rdi)
+; AVX512BW-NEXT: movq %rbp, %rsp
+; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
store <64 x i8> zeroinitializer, <64 x i8>* %dst, align 32, !nontemporal !1