[llvm] 169db80 - [X86] Canonicalize fp zero vectors from bitcasted integer zero vectors

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 30 10:34:34 PST 2023


Author: Simon Pilgrim
Date: 2023-11-30T18:33:52Z
New Revision: 169db80e41936811c6744f2c513a1ed00d97f10e

URL: https://github.com/llvm/llvm-project/commit/169db80e41936811c6744f2c513a1ed00d97f10e
DIFF: https://github.com/llvm/llvm-project/commit/169db80e41936811c6744f2c513a1ed00d97f10e.diff

LOG: [X86] Canonicalize fp zero vectors from bitcasted integer zero vectors

Generic DAG combining is supposed to handle this, but the fold can be blocked by hasOneUse checks.

Noticed while investigating #26392
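
For illustration, a minimal sketch of the kind of IR that reaches this path,
adapted from the zero_test case in 2011-10-19-widen_vselect.ll below (the
undef operands and store destination follow that test, not a real workload):
after legalization the <2 x float> zero survives as a bitcast of an integer
zero vector, and a multi-use operand previously stopped the generic combine
from folding it back to a canonical fp zero.

    define void @zero_test() {
    entry:
      ; The select folds to a <2 x float> zero vector, which widening and
      ; store legalization route through an integer-typed bitcast.
      %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer
      store <2 x float> %0, ptr undef
      ret void
    }

With the canonical fp zero restored, the store lowers to a single scalar zero
store (movq $0 on x86-64) instead of materializing xmm0 with xorps and storing
it with movlps, as the updated CHECK lines below show.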

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
    llvm/test/CodeGen/X86/2012-07-10-extload64.ll
    llvm/test/CodeGen/X86/fold-load-vec.ll
    llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
    llvm/test/CodeGen/X86/half.ll
    llvm/test/CodeGen/X86/nontemporal-3.ll
    llvm/test/CodeGen/X86/pr13577.ll
    llvm/test/CodeGen/X86/pr41619.ll
    llvm/test/CodeGen/X86/vec_zero_cse.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6167be7bdf84e9f..b73779edc5f7270 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42930,6 +42930,12 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // Canonicalize fp zero vectors - these sometimes don't fold due to one use
+  // limits.
+  if (VT.isVector() && TLI.isTypeLegal(VT) && ISD::isBuildVectorAllZeros(N) &&
+      (VT.getScalarType() == MVT::f32 || VT.getScalarType() == MVT::f64))
+    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N0));
+
   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
   // most of these to scalar anyway.
   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&

diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index e7f62b9dfc22196..d3f410d37567c36 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -49,14 +49,12 @@ entry:
 define void @zero_test() {
 ; X86-LABEL: zero_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movlps %xmm0, (%eax)
+; X86-NEXT:    movl $0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: zero_test:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movlps %xmm0, (%rax)
+; X64-NEXT:    movq $0, (%rax)
 ; X64-NEXT:    retq
 entry:
   %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer

diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
index b6ec3b34eb1072d..f4ec8bde3700a9b 100644
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -29,8 +29,8 @@ define void @store_64(ptr %ptr) {
 ; X86-LABEL: store_64:
 ; X86:       # %bb.0: # %BB
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorps %xmm0, %xmm0
-; X86-NEXT:    movlps %xmm0, (%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:    movl $0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store_64:

diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index 348929cdf9f79e4..0bf846a0930bb4c 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -10,8 +10,8 @@ define void @sample_test(ptr %source, ptr %dest) nounwind {
 ; CHECK-NEXT:    subq $24, %rsp
 ; CHECK-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, (%rsp)
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)

diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 88425ea87845dfe..713ecf5414bac5c 100644
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -51,11 +51,6 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X32-NEXT:    mulps %xmm0, %xmm0
-; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
@@ -64,8 +59,10 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    cmpunordps %xmm0, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X32-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X32-NEXT:    minps %xmm0, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -135,11 +132,6 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X64-NEXT:    mulps %xmm0, %xmm0
-; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
@@ -148,8 +140,10 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    cmpunordps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X64-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    minps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    xorl %ebx, %ebx
 ; X64-NEXT:    xorps %xmm3, %xmm3

diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 596e465ee8cacf2..7225257203161b2 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1082,12 +1082,11 @@ define void @main.158() #0 {
 ; BWON-F16C-LABEL: main.158:
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
-; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT:    vucomiss %xmm0, %xmm1
-; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
+; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
+; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm2
 ; BWON-F16C-NEXT:    jae .LBB20_2
 ; BWON-F16C-NEXT:  # %bb.1: # %entry
 ; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1100,8 +1099,7 @@ define void @main.158() #0 {
 ; CHECK-I686-LABEL: main.158:
 ; CHECK-I686:       # %bb.0: # %entry
 ; CHECK-I686-NEXT:    subl $12, %esp
-; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
-; CHECK-I686-NEXT:    movd %xmm0, (%esp)
+; CHECK-I686-NEXT:    movl $0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)

diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll
index a2d2c5ca4301186..f9872b10097a150 100644
--- a/llvm/test/CodeGen/X86/nontemporal-3.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-3.ll
@@ -93,247 +93,66 @@ define void @test_zero_v4f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_zero_v8f32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v8f32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v8f32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorl %eax, %eax
-; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
-; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v8f32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v8f32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v8f32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v8f32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <8 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v4i64_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v4i64_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v4i64_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v4i64_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v4i64_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v4i64_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v4i64_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <4 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v8i32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v8i32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v8i32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v8i32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v8i32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <8 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i16_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i16_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v16i16_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v16i16_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v16i16_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v16i16_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v16i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <16 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v32i8_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i8_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v32i8_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v32i8_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v32i8_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v32i8_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v32i8_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    retq
   store <32 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -508,347 +327,86 @@ define void @test_zero_v8f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_zero_v16f32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v16f32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v16f32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorl %eax, %eax
-; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
-; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
-; SSE4A-NEXT:    movntiq %rax, 40(%rdi)
-; SSE4A-NEXT:    movntiq %rax, 56(%rdi)
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v16f32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v16f32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v16f32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v16f32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i64_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i64_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v8i64_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v8i64_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v8i64_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v8i64_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v8i64_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i32_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i32_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v16i32_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v16i32_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v16i32_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v16i32_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v16i32_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v32i16_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i16_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v32i16_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v32i16_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v32i16_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v32i16_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v32i16_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v64i8_align1(ptr %dst) nounwind {
-; SSE2-LABEL: test_zero_v64i8_align1:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movntiq %rax, 8(%rdi)
-; SSE2-NEXT:    movntiq %rax, (%rdi)
-; SSE2-NEXT:    movntiq %rax, 24(%rdi)
-; SSE2-NEXT:    movntiq %rax, 16(%rdi)
-; SSE2-NEXT:    movntiq %rax, 40(%rdi)
-; SSE2-NEXT:    movntiq %rax, 32(%rdi)
-; SSE2-NEXT:    movntiq %rax, 56(%rdi)
-; SSE2-NEXT:    movntiq %rax, 48(%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE4A-LABEL: test_zero_v64i8_align1:
-; SSE4A:       # %bb.0:
-; SSE4A-NEXT:    xorps %xmm0, %xmm0
-; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT:    retq
-;
-; SSE41-LABEL: test_zero_v64i8_align1:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    xorl %eax, %eax
-; SSE41-NEXT:    movntiq %rax, 8(%rdi)
-; SSE41-NEXT:    movntiq %rax, (%rdi)
-; SSE41-NEXT:    movntiq %rax, 24(%rdi)
-; SSE41-NEXT:    movntiq %rax, 16(%rdi)
-; SSE41-NEXT:    movntiq %rax, 40(%rdi)
-; SSE41-NEXT:    movntiq %rax, 32(%rdi)
-; SSE41-NEXT:    movntiq %rax, 56(%rdi)
-; SSE41-NEXT:    movntiq %rax, 48(%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: test_zero_v64i8_align1:
-; AVX:       # %bb.0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    movntiq %rax, 8(%rdi)
-; AVX-NEXT:    movntiq %rax, (%rdi)
-; AVX-NEXT:    movntiq %rax, 24(%rdi)
-; AVX-NEXT:    movntiq %rax, 16(%rdi)
-; AVX-NEXT:    movntiq %rax, 40(%rdi)
-; AVX-NEXT:    movntiq %rax, 32(%rdi)
-; AVX-NEXT:    movntiq %rax, 56(%rdi)
-; AVX-NEXT:    movntiq %rax, 48(%rdi)
-; AVX-NEXT:    retq
-;
-; AVX512-LABEL: test_zero_v64i8_align1:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movntiq %rax, 8(%rdi)
-; AVX512-NEXT:    movntiq %rax, (%rdi)
-; AVX512-NEXT:    movntiq %rax, 24(%rdi)
-; AVX512-NEXT:    movntiq %rax, 16(%rdi)
-; AVX512-NEXT:    movntiq %rax, 40(%rdi)
-; AVX512-NEXT:    movntiq %rax, 32(%rdi)
-; AVX512-NEXT:    movntiq %rax, 56(%rdi)
-; AVX512-NEXT:    movntiq %rax, 48(%rdi)
-; AVX512-NEXT:    retq
+; CHECK-LABEL: test_zero_v64i8_align1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    movntiq %rax, 8(%rdi)
+; CHECK-NEXT:    movntiq %rax, (%rdi)
+; CHECK-NEXT:    movntiq %rax, 24(%rdi)
+; CHECK-NEXT:    movntiq %rax, 16(%rdi)
+; CHECK-NEXT:    movntiq %rax, 40(%rdi)
+; CHECK-NEXT:    movntiq %rax, 32(%rdi)
+; CHECK-NEXT:    movntiq %rax, 56(%rdi)
+; CHECK-NEXT:    movntiq %rax, 48(%rdi)
+; CHECK-NEXT:    retq
   store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -1214,3 +772,7 @@ define void @test_zero_v64i8_align32(ptr %dst) nounwind {
 }
 
 !1 = !{i32 1}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SSE2: {{.*}}
+; SSE41: {{.*}}
+; SSE4A: {{.*}}

diff --git a/llvm/test/CodeGen/X86/pr13577.ll b/llvm/test/CodeGen/X86/pr13577.ll
index 7511560d85f5191..ef359e740c09d7c 100644
--- a/llvm/test/CodeGen/X86/pr13577.ll
+++ b/llvm/test/CodeGen/X86/pr13577.ll
@@ -29,7 +29,8 @@ declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
 define float @pr26070() {
 ; CHECK-LABEL: pr26070:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
   %c = call float @copysignf(float 1.0, float undef) readnone
   ret float %c

diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
index 7d1d139a38a520d..88dcd7798f0c3d2 100644
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -7,10 +7,9 @@ define void @foo(double %arg) {
 ; CHECK:       ## %bb.0: ## %bb
 ; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    vmovd %eax, %xmm0
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    movl %eax, (%rax)
-; CHECK-NEXT:    vmovlps %xmm1, (%rax)
+; CHECK-NEXT:    movq $0, (%rax)
 ; CHECK-NEXT:    retq
 bb:
   %tmp = bitcast double %arg to i64

diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
index 99185277ba745d5..800dd59c262666d 100644
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -15,8 +15,8 @@ define void @test1() {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl $0, M1+4
 ; X32-NEXT:    movl $0, M1
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    movlps %xmm0, M2
+; X32-NEXT:    movl $0, M2+4
+; X32-NEXT:    movl $0, M2
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test1:

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 23c37af1db2f7c9..eb5fc1523c08a7d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -108,11 +108,10 @@ define void @PR46178(ptr %0) {
 ; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0
 ; X86-NEXT:    vpmovqw %ymm1, %xmm1
-; X86-NEXT:    vpsllw $8, %xmm0, %xmm0
-; X86-NEXT:    vpsraw $8, %xmm0, %xmm0
-; X86-NEXT:    vpsllw $8, %xmm1, %xmm1
-; X86-NEXT:    vpsraw $8, %xmm1, %xmm1
-; X86-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    vpsllw $8, %ymm0, %ymm0
+; X86-NEXT:    vpsraw $8, %ymm0, %ymm0
+; X86-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
