[llvm] 2aef332 - [X86] fast-isel-store.ll - cleanup check prefixes

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 29 10:00:46 PST 2024


Author: Simon Pilgrim
Date: 2024-01-29T18:00:35Z
New Revision: 2aef33230d3402878a837f9aaa37e37d0763d1ac

URL: https://github.com/llvm/llvm-project/commit/2aef33230d3402878a837f9aaa37e37d0763d1ac
DIFF: https://github.com/llvm/llvm-project/commit/2aef33230d3402878a837f9aaa37e37d0763d1ac.diff

LOG: [X86] fast-isel-store.ll - cleanup check prefixes

The 32/64-bit triples and their check prefixes were inverted; also add the missing nounwind attribute to strip the cfi noise from the generated checks.
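
For reference, a minimal sketch (not taken from the commit; the @store_zero function is hypothetical, and its CHECK lines, which utils/update_llc_test_checks.py would regenerate, are omitted) of the convention the test now follows: each RUN line pairs a common X86/X64 prefix with a per-feature prefix, and the function carries nounwind so the autogenerated checks contain no .cfi_* directives.

; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X86,X86-SSE
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,X64-SSE

; Hypothetical test body: nounwind keeps the autogenerated checks free of cfi noise.
define void @store_zero(ptr %p) nounwind {
  store i32 0, ptr %p
  ret void
}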

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/fast-isel-store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/fast-isel-store.ll b/llvm/test/CodeGen/X86/fast-isel-store.ll
index 8472498ba41f93d..eba538d21339240 100644
--- a/llvm/test/CodeGen/X86/fast-isel-store.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-store.ll
@@ -1,771 +1,747 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL32,SSE32
-; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL64,SSE64
-; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVXONLY32
-; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVXONLY64
-; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
-; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264
-; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232
-; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264
+; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X86,X86-SSE
+; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,X64-SSE
+; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
+; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512
+; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512
+; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512
+; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512
 
-define i32 @test_store_32(ptr nocapture %addr, i32 %value) {
-; ALL32-LABEL: test_store_32:
-; ALL32:       # %bb.0: # %entry
-; ALL32-NEXT:    movl %esi, %eax
-; ALL32-NEXT:    movl %esi, (%rdi)
-; ALL32-NEXT:    retq
-;
-; ALL64-LABEL: test_store_32:
-; ALL64:       # %bb.0: # %entry
-; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; ALL64-NEXT:    movl %eax, (%ecx)
-; ALL64-NEXT:    retl
+define i32 @test_store_32(ptr nocapture %addr, i32 %value) nounwind {
+; X86-LABEL: test_store_32:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %eax, (%ecx)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_store_32:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movl %esi, (%rdi)
+; X64-NEXT:    retq
 entry:
   store i32 %value, ptr %addr, align 1
   ret i32 %value
 }
 
-define i16 @test_store_16(ptr nocapture %addr, i16 %value) {
-; ALL32-LABEL: test_store_16:
-; ALL32:       # %bb.0: # %entry
-; ALL32-NEXT:    movl %esi, %eax
-; ALL32-NEXT:    movw %ax, (%rdi)
-; ALL32-NEXT:    # kill: def $ax killed $ax killed $eax
-; ALL32-NEXT:    retq
-;
-; ALL64-LABEL: test_store_16:
-; ALL64:       # %bb.0: # %entry
-; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; ALL64-NEXT:    movw %ax, (%ecx)
-; ALL64-NEXT:    retl
+define i16 @test_store_16(ptr nocapture %addr, i16 %value) nounwind {
+; X86-LABEL: test_store_16:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_store_16:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    movw %ax, (%rdi)
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
 entry:
   store i16 %value, ptr %addr, align 1
   ret i16 %value
 }
 
-define <4 x i32> @test_store_4xi32(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
-; SSE32-LABEL: test_store_4xi32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    paddd %xmm1, %xmm0
-; SSE32-NEXT:    movdqu %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xi32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    paddd %xmm1, %xmm0
-; SSE64-NEXT:    movdqu %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xi32:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX32-NEXT:    vmovdqu %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xi32:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x i32> @test_store_4xi32(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind {
+; X86-SSE-LABEL: test_store_4xi32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    paddd %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqu %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xi32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    paddd %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqu %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xi32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqu %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xi32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqu %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = add <4 x i32> %value, %value2 ; to force integer type on store
   store <4 x i32> %foo, ptr %addr, align 1
   ret <4 x i32> %foo
 }
 
-define <4 x i32> @test_store_4xi32_aligned(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
-; SSE32-LABEL: test_store_4xi32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    paddd %xmm1, %xmm0
-; SSE32-NEXT:    movdqa %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xi32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    paddd %xmm1, %xmm0
-; SSE64-NEXT:    movdqa %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xi32_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX32-NEXT:    vmovdqa %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xi32_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x i32> @test_store_4xi32_aligned(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind {
+; X86-SSE-LABEL: test_store_4xi32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    paddd %xmm1, %xmm0
+; X86-SSE-NEXT:    movdqa %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xi32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    paddd %xmm1, %xmm0
+; X64-SSE-NEXT:    movdqa %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xi32_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovdqa %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xi32_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovdqa %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = add <4 x i32> %value, %value2 ; to force integer type on store
   store <4 x i32> %foo, ptr %addr, align 16
   ret <4 x i32> %foo
 }
 
-define <4 x float> @test_store_4xf32(ptr nocapture %addr, <4 x float> %value) {
-; SSE32-LABEL: test_store_4xf32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movups %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xf32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movups %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xf32:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovups %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xf32:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovups %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x float> @test_store_4xf32(ptr nocapture %addr, <4 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_4xf32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xf32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movups %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xf32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xf32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <4 x float> %value, ptr %addr, align 1
   ret <4 x float> %value
 }
 
-define <4 x float> @test_store_4xf32_aligned(ptr nocapture %addr, <4 x float> %value) {
-; SSE32-LABEL: test_store_4xf32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movaps %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xf32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movaps %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xf32_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xf32_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovaps %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x float> @test_store_4xf32_aligned(ptr nocapture %addr, <4 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_4xf32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xf32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xf32_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xf32_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovaps %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <4 x float> %value, ptr %addr, align 16
   ret <4 x float> %value
 }
 
-define <2 x double> @test_store_2xf64(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) {
-; SSE32-LABEL: test_store_2xf64:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm1, %xmm0
-; SSE32-NEXT:    movupd %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_2xf64:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd %xmm1, %xmm0
-; SSE64-NEXT:    movupd %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_2xf64:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_2xf64:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX64-NEXT:    vmovupd %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <2 x double> @test_store_2xf64(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_2xf64:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    movupd %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_2xf64:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_2xf64:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovupd %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_2xf64:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovupd %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = fadd <2 x double> %value, %value2 ; to force dobule type on store
   store <2 x double> %foo, ptr %addr, align 1
   ret <2 x double> %foo
 }
 
-define <2 x double> @test_store_2xf64_aligned(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) {
-; SSE32-LABEL: test_store_2xf64_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm1, %xmm0
-; SSE32-NEXT:    movapd %xmm0, (%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_2xf64_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd %xmm1, %xmm0
-; SSE64-NEXT:    movapd %xmm0, (%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_2xf64_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_2xf64_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
-; AVX64-NEXT:    vmovapd %xmm0, (%eax)
-; AVX64-NEXT:    retl
+define <2 x double> @test_store_2xf64_aligned(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_2xf64_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd %xmm1, %xmm0
+; X86-SSE-NEXT:    movapd %xmm0, (%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_2xf64_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm1, %xmm0
+; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_2xf64_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmovapd %xmm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_2xf64_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vmovapd %xmm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = fadd <2 x double> %value, %value2 ; to force dobule type on store
   store <2 x double> %foo, ptr %addr, align 16
   ret <2 x double> %foo
 }
 
-define <8 x i32> @test_store_8xi32(ptr nocapture %addr, <8 x i32> %value) {
-; SSE32-LABEL: test_store_8xi32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movups %xmm0, (%rdi)
-; SSE32-NEXT:    movups %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xi32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movups %xmm0, (%eax)
-; SSE64-NEXT:    movups %xmm1, 16(%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_8xi32:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovups %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_8xi32:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovups %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <8 x i32> @test_store_8xi32(ptr nocapture %addr, <8 x i32> %value) nounwind {
+; X86-SSE-LABEL: test_store_8xi32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xi32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movups %xmm0, (%rdi)
+; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_8xi32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_8xi32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <8 x i32> %value, ptr %addr, align 1
   ret <8 x i32> %value
 }
 
-define <8 x i32> @test_store_8xi32_aligned(ptr nocapture %addr, <8 x i32> %value) {
-; SSE32-LABEL: test_store_8xi32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movaps %xmm0, (%rdi)
-; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xi32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movaps %xmm0, (%eax)
-; SSE64-NEXT:    movaps %xmm1, 16(%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_8xi32_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_8xi32_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovaps %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <8 x i32> @test_store_8xi32_aligned(ptr nocapture %addr, <8 x i32> %value) nounwind {
+; X86-SSE-LABEL: test_store_8xi32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xi32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
+; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_8xi32_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_8xi32_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <8 x i32> %value, ptr %addr, align 32
   ret <8 x i32> %value
 }
 
-define <8 x float> @test_store_8xf32(ptr nocapture %addr, <8 x float> %value) {
-; SSE32-LABEL: test_store_8xf32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movups %xmm0, (%rdi)
-; SSE32-NEXT:    movups %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xf32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movups %xmm0, (%eax)
-; SSE64-NEXT:    movups %xmm1, 16(%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_8xf32:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovups %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_8xf32:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovups %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <8 x float> @test_store_8xf32(ptr nocapture %addr, <8 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_8xf32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xf32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movups %xmm0, (%rdi)
+; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_8xf32:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_8xf32:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovups %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <8 x float> %value, ptr %addr, align 1
   ret <8 x float> %value
 }
 
-define <8 x float> @test_store_8xf32_aligned(ptr nocapture %addr, <8 x float> %value) {
-; SSE32-LABEL: test_store_8xf32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movaps %xmm0, (%rdi)
-; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xf32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movaps %xmm0, (%eax)
-; SSE64-NEXT:    movaps %xmm1, 16(%eax)
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_8xf32_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_8xf32_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vmovaps %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <8 x float> @test_store_8xf32_aligned(ptr nocapture %addr, <8 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_8xf32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xf32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
+; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_8xf32_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_8xf32_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   store <8 x float> %value, ptr %addr, align 32
   ret <8 x float> %value
 }
 
-define <4 x double> @test_store_4xf64(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) {
-; SSE32-LABEL: test_store_4xf64:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm2, %xmm0
-; SSE32-NEXT:    movupd %xmm0, (%rdi)
-; SSE32-NEXT:    addpd %xmm3, %xmm1
-; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xf64:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd %xmm2, %xmm0
-; SSE64-NEXT:    movupd %xmm0, (%eax)
-; SSE64-NEXT:    addpd %xmm3, %xmm1
-; SSE64-NEXT:    movupd %xmm1, 16(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xf64:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xf64:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX64-NEXT:    vmovupd %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x double> @test_store_4xf64(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_4xf64:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd %xmm2, %xmm0
+; X86-SSE-NEXT:    movupd %xmm0, (%eax)
+; X86-SSE-NEXT:    addpd %xmm3, %xmm1
+; X86-SSE-NEXT:    movupd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xf64:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm2, %xmm0
+; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
+; X64-SSE-NEXT:    addpd %xmm3, %xmm1
+; X64-SSE-NEXT:    movupd %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xf64:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X86-AVX-NEXT:    vmovupd %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xf64:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X64-AVX-NEXT:    vmovupd %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = fadd <4 x double> %value, %value2 ; to force dobule type on store
   store <4 x double> %foo, ptr %addr, align 1
   ret <4 x double> %foo
 }
 
-define <4 x double> @test_store_4xf64_aligned(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) {
-; SSE32-LABEL: test_store_4xf64_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm2, %xmm0
-; SSE32-NEXT:    movapd %xmm0, (%rdi)
-; SSE32-NEXT:    addpd %xmm3, %xmm1
-; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_4xf64_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd %xmm2, %xmm0
-; SSE64-NEXT:    movapd %xmm0, (%eax)
-; SSE64-NEXT:    addpd %xmm3, %xmm1
-; SSE64-NEXT:    movapd %xmm1, 16(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVX32-LABEL: test_store_4xf64_aligned:
-; AVX32:       # %bb.0:
-; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
-; AVX32-NEXT:    retq
-;
-; AVX64-LABEL: test_store_4xf64_aligned:
-; AVX64:       # %bb.0:
-; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
-; AVX64-NEXT:    vmovapd %ymm0, (%eax)
-; AVX64-NEXT:    retl
+define <4 x double> @test_store_4xf64_aligned(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_4xf64_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd %xmm2, %xmm0
+; X86-SSE-NEXT:    movapd %xmm0, (%eax)
+; X86-SSE-NEXT:    addpd %xmm3, %xmm1
+; X86-SSE-NEXT:    movapd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_4xf64_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm2, %xmm0
+; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
+; X64-SSE-NEXT:    addpd %xmm3, %xmm1
+; X64-SSE-NEXT:    movapd %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX-LABEL: test_store_4xf64_aligned:
+; X86-AVX:       # %bb.0:
+; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X86-AVX-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: test_store_4xf64_aligned:
+; X64-AVX:       # %bb.0:
+; X64-AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; X64-AVX-NEXT:    vmovapd %ymm0, (%rdi)
+; X64-AVX-NEXT:    retq
   %foo = fadd <4 x double> %value, %value2 ; to force dobule type on store
   store <4 x double> %foo, ptr %addr, align 32
   ret <4 x double> %foo
 }
 
-define <16 x i32> @test_store_16xi32(ptr nocapture %addr, <16 x i32> %value) {
-; SSE32-LABEL: test_store_16xi32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movups %xmm0, (%rdi)
-; SSE32-NEXT:    movups %xmm1, 16(%rdi)
-; SSE32-NEXT:    movups %xmm2, 32(%rdi)
-; SSE32-NEXT:    movups %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_16xi32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movups %xmm0, (%eax)
-; SSE64-NEXT:    movups %xmm1, 16(%eax)
-; SSE64-NEXT:    movups %xmm2, 32(%eax)
-; SSE64-NEXT:    movups %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_16xi32:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_16xi32:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
-; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_16xi32:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_16xi32:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vmovups %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <16 x i32> @test_store_16xi32(ptr nocapture %addr, <16 x i32> %value) nounwind {
+; X86-SSE-LABEL: test_store_16xi32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movups %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movups %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_16xi32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movups %xmm0, (%rdi)
+; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    movups %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    movups %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_16xi32:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX1-NEXT:    vmovups %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_16xi32:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_16xi32:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovups %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_16xi32:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   store <16 x i32> %value, ptr %addr, align 1
   ret <16 x i32> %value
 }
 
-define <16 x i32> @test_store_16xi32_aligned(ptr nocapture %addr, <16 x i32> %value) {
-; SSE32-LABEL: test_store_16xi32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movaps %xmm0, (%rdi)
-; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
-; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
-; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_16xi32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movaps %xmm0, (%eax)
-; SSE64-NEXT:    movaps %xmm1, 16(%eax)
-; SSE64-NEXT:    movaps %xmm2, 32(%eax)
-; SSE64-NEXT:    movaps %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_16xi32_aligned:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_16xi32_aligned:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
-; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_16xi32_aligned:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_16xi32_aligned:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <16 x i32> @test_store_16xi32_aligned(ptr nocapture %addr, <16 x i32> %value) nounwind {
+; X86-SSE-LABEL: test_store_16xi32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movaps %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_16xi32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
+; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    movaps %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_16xi32_aligned:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX1-NEXT:    vmovaps %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_16xi32_aligned:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovaps %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_16xi32_aligned:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_16xi32_aligned:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovaps %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   store <16 x i32> %value, ptr %addr, align 64
   ret <16 x i32> %value
 }
 
-define <16 x float> @test_store_16xf32(ptr nocapture %addr, <16 x float> %value) {
-; SSE32-LABEL: test_store_16xf32:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movups %xmm0, (%rdi)
-; SSE32-NEXT:    movups %xmm1, 16(%rdi)
-; SSE32-NEXT:    movups %xmm2, 32(%rdi)
-; SSE32-NEXT:    movups %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_16xf32:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movups %xmm0, (%eax)
-; SSE64-NEXT:    movups %xmm1, 16(%eax)
-; SSE64-NEXT:    movups %xmm2, 32(%eax)
-; SSE64-NEXT:    movups %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_16xf32:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_16xf32:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
-; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_16xf32:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_16xf32:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vmovups %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <16 x float> @test_store_16xf32(ptr nocapture %addr, <16 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_16xf32:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movups %xmm0, (%eax)
+; X86-SSE-NEXT:    movups %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movups %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movups %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_16xf32:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movups %xmm0, (%rdi)
+; X64-SSE-NEXT:    movups %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    movups %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    movups %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_16xf32:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovups %ymm0, (%eax)
+; X86-AVX1-NEXT:    vmovups %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_16xf32:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovups %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vmovups %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_16xf32:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovups %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_16xf32:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovups %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   store <16 x float> %value, ptr %addr, align 1
   ret <16 x float> %value
 }
 
-define <16 x float> @test_store_16xf32_aligned(ptr nocapture %addr, <16 x float> %value) {
-; SSE32-LABEL: test_store_16xf32_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    movaps %xmm0, (%rdi)
-; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
-; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
-; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_16xf32_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    movaps %xmm0, (%eax)
-; SSE64-NEXT:    movaps %xmm1, 16(%eax)
-; SSE64-NEXT:    movaps %xmm2, 32(%eax)
-; SSE64-NEXT:    movaps %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_16xf32_aligned:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_16xf32_aligned:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
-; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_16xf32_aligned:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_16xf32_aligned:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <16 x float> @test_store_16xf32_aligned(ptr nocapture %addr, <16 x float> %value) nounwind {
+; X86-SSE-LABEL: test_store_16xf32_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    movaps %xmm0, (%eax)
+; X86-SSE-NEXT:    movaps %xmm1, 16(%eax)
+; X86-SSE-NEXT:    movaps %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movaps %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_16xf32_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    movaps %xmm0, (%rdi)
+; X64-SSE-NEXT:    movaps %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    movaps %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    movaps %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_16xf32_aligned:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
+; X86-AVX1-NEXT:    vmovaps %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_16xf32_aligned:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vmovaps %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_16xf32_aligned:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vmovaps %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_16xf32_aligned:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vmovaps %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   store <16 x float> %value, ptr %addr, align 64
   ret <16 x float> %value
 }
 
-define <8 x double> @test_store_8xf64(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) {
-; SSE32-LABEL: test_store_8xf64:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm4, %xmm0
-; SSE32-NEXT:    movupd %xmm0, (%rdi)
-; SSE32-NEXT:    addpd %xmm5, %xmm1
-; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
-; SSE32-NEXT:    addpd %xmm6, %xmm2
-; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
-; SSE32-NEXT:    addpd %xmm7, %xmm3
-; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xf64:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    addpd %xmm4, %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
-; SSE64-NEXT:    movupd %xmm0, (%eax)
-; SSE64-NEXT:    addpd %xmm6, %xmm1
-; SSE64-NEXT:    movupd %xmm1, 16(%eax)
-; SSE64-NEXT:    addpd %xmm5, %xmm2
-; SSE64-NEXT:    movupd %xmm2, 32(%eax)
-; SSE64-NEXT:    movupd %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_8xf64:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
-; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_8xf64:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    pushl %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
-; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
-; AVXONLY64-NEXT:    movl %esp, %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
-; AVXONLY64-NEXT:    andl $-32, %esp
-; AVXONLY64-NEXT:    subl $32, %esp
-; AVXONLY64-NEXT:    vmovapd 40(%ebp), %ymm3
-; AVXONLY64-NEXT:    movl 8(%ebp), %eax
-; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
-; AVXONLY64-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
-; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    movl %ebp, %esp
-; AVXONLY64-NEXT:    popl %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_8xf64:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_8xf64:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <8 x double> @test_store_8xf64(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_8xf64:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    addpd %xmm4, %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movupd %xmm0, (%eax)
+; X86-SSE-NEXT:    addpd %xmm6, %xmm1
+; X86-SSE-NEXT:    movupd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    addpd %xmm5, %xmm2
+; X86-SSE-NEXT:    movupd %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movupd %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xf64:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm4, %xmm0
+; X64-SSE-NEXT:    movupd %xmm0, (%rdi)
+; X64-SSE-NEXT:    addpd %xmm5, %xmm1
+; X64-SSE-NEXT:    movupd %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    addpd %xmm6, %xmm2
+; X64-SSE-NEXT:    movupd %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    addpd %xmm7, %xmm3
+; X64-SSE-NEXT:    movupd %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_8xf64:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    pushl %ebp
+; X86-AVX1-NEXT:    movl %esp, %ebp
+; X86-AVX1-NEXT:    andl $-32, %esp
+; X86-AVX1-NEXT:    subl $32, %esp
+; X86-AVX1-NEXT:    vmovapd 40(%ebp), %ymm3
+; X86-AVX1-NEXT:    movl 8(%ebp), %eax
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovupd %ymm0, (%eax)
+; X86-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vmovupd %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    movl %ebp, %esp
+; X86-AVX1-NEXT:    popl %ebp
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_8xf64:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovupd %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vmovupd %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_8xf64:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; X86-AVX512-NEXT:    vmovupd %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_8xf64:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovupd %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   %foo = fadd <8 x double> %value, %value2 ; to force dobule type on store
   store <8 x double> %foo, ptr %addr, align 1
   ret <8 x double> %foo
 }
 
-define <8 x double> @test_store_8xf64_aligned(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) {
-; SSE32-LABEL: test_store_8xf64_aligned:
-; SSE32:       # %bb.0:
-; SSE32-NEXT:    addpd %xmm4, %xmm0
-; SSE32-NEXT:    movapd %xmm0, (%rdi)
-; SSE32-NEXT:    addpd %xmm5, %xmm1
-; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
-; SSE32-NEXT:    addpd %xmm6, %xmm2
-; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
-; SSE32-NEXT:    addpd %xmm7, %xmm3
-; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
-; SSE32-NEXT:    retq
-;
-; SSE64-LABEL: test_store_8xf64_aligned:
-; SSE64:       # %bb.0:
-; SSE64-NEXT:    subl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 16
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
-; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
-; SSE64-NEXT:    addpd %xmm4, %xmm3
-; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
-; SSE64-NEXT:    movapd %xmm0, (%eax)
-; SSE64-NEXT:    addpd %xmm6, %xmm1
-; SSE64-NEXT:    movapd %xmm1, 16(%eax)
-; SSE64-NEXT:    addpd %xmm5, %xmm2
-; SSE64-NEXT:    movapd %xmm2, 32(%eax)
-; SSE64-NEXT:    movapd %xmm3, 48(%eax)
-; SSE64-NEXT:    addl $12, %esp
-; SSE64-NEXT:    .cfi_def_cfa_offset 4
-; SSE64-NEXT:    retl
-;
-; AVXONLY32-LABEL: test_store_8xf64_aligned:
-; AVXONLY32:       # %bb.0:
-; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
-; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
-; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
-; AVXONLY32-NEXT:    retq
-;
-; AVXONLY64-LABEL: test_store_8xf64_aligned:
-; AVXONLY64:       # %bb.0:
-; AVXONLY64-NEXT:    pushl %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
-; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
-; AVXONLY64-NEXT:    movl %esp, %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
-; AVXONLY64-NEXT:    andl $-32, %esp
-; AVXONLY64-NEXT:    subl $32, %esp
-; AVXONLY64-NEXT:    vmovapd 40(%ebp), %ymm3
-; AVXONLY64-NEXT:    movl 8(%ebp), %eax
-; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
-; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
-; AVXONLY64-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
-; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
-; AVXONLY64-NEXT:    movl %ebp, %esp
-; AVXONLY64-NEXT:    popl %ebp
-; AVXONLY64-NEXT:    .cfi_def_cfa %esp, 4
-; AVXONLY64-NEXT:    retl
-;
-; AVX51232-LABEL: test_store_8xf64_aligned:
-; AVX51232:       # %bb.0:
-; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
-; AVX51232-NEXT:    retq
-;
-; AVX51264-LABEL: test_store_8xf64_aligned:
-; AVX51264:       # %bb.0:
-; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
-; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
-; AVX51264-NEXT:    retl
+define <8 x double> @test_store_8xf64_aligned(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind {
+; X86-SSE-LABEL: test_store_8xf64_aligned:
+; X86-SSE:       # %bb.0:
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm4
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm5
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm6
+; X86-SSE-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
+; X86-SSE-NEXT:    addpd %xmm4, %xmm3
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
+; X86-SSE-NEXT:    movapd %xmm0, (%eax)
+; X86-SSE-NEXT:    addpd %xmm6, %xmm1
+; X86-SSE-NEXT:    movapd %xmm1, 16(%eax)
+; X86-SSE-NEXT:    addpd %xmm5, %xmm2
+; X86-SSE-NEXT:    movapd %xmm2, 32(%eax)
+; X86-SSE-NEXT:    movapd %xmm3, 48(%eax)
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: test_store_8xf64_aligned:
+; X64-SSE:       # %bb.0:
+; X64-SSE-NEXT:    addpd %xmm4, %xmm0
+; X64-SSE-NEXT:    movapd %xmm0, (%rdi)
+; X64-SSE-NEXT:    addpd %xmm5, %xmm1
+; X64-SSE-NEXT:    movapd %xmm1, 16(%rdi)
+; X64-SSE-NEXT:    addpd %xmm6, %xmm2
+; X64-SSE-NEXT:    movapd %xmm2, 32(%rdi)
+; X64-SSE-NEXT:    addpd %xmm7, %xmm3
+; X64-SSE-NEXT:    movapd %xmm3, 48(%rdi)
+; X64-SSE-NEXT:    retq
+;
+; X86-AVX1-LABEL: test_store_8xf64_aligned:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    pushl %ebp
+; X86-AVX1-NEXT:    movl %esp, %ebp
+; X86-AVX1-NEXT:    andl $-32, %esp
+; X86-AVX1-NEXT:    subl $32, %esp
+; X86-AVX1-NEXT:    vmovapd 40(%ebp), %ymm3
+; X86-AVX1-NEXT:    movl 8(%ebp), %eax
+; X86-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X86-AVX1-NEXT:    vmovapd %ymm0, (%eax)
+; X86-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
+; X86-AVX1-NEXT:    vmovapd %ymm1, 32(%eax)
+; X86-AVX1-NEXT:    movl %ebp, %esp
+; X86-AVX1-NEXT:    popl %ebp
+; X86-AVX1-NEXT:    retl
+;
+; X64-AVX1-LABEL: test_store_8xf64_aligned:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT:    vmovapd %ymm0, (%rdi)
+; X64-AVX1-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
+; X64-AVX1-NEXT:    vmovapd %ymm1, 32(%rdi)
+; X64-AVX1-NEXT:    retq
+;
+; X86-AVX512-LABEL: test_store_8xf64_aligned:
+; X86-AVX512:       # %bb.0:
+; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; X86-AVX512-NEXT:    vmovapd %zmm0, (%eax)
+; X86-AVX512-NEXT:    retl
+;
+; X64-AVX512-LABEL: test_store_8xf64_aligned:
+; X64-AVX512:       # %bb.0:
+; X64-AVX512-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vmovapd %zmm0, (%rdi)
+; X64-AVX512-NEXT:    retq
   %foo = fadd <8 x double> %value, %value2 ; to force dobule type on store
   store <8 x double> %foo, ptr %addr, align 64
   ret <8 x double> %foo
 }


More information about the llvm-commits mailing list