[llvm] c12de14 - Revert "[X86] Canonicalize fp zero vectors from bitcasted integer zero vectors"

Douglas Yung via llvm-commits <llvm-commits at lists.llvm.org>
Thu Nov 30 12:00:33 PST 2023


Author: Douglas Yung
Date: 2023-11-30T11:59:50-08:00
New Revision: c12de1487670ab009e738b905ad8296a8cb2c685

URL: https://github.com/llvm/llvm-project/commit/c12de1487670ab009e738b905ad8296a8cb2c685
DIFF: https://github.com/llvm/llvm-project/commit/c12de1487670ab009e738b905ad8296a8cb2c685.diff

LOG: Revert "[X86] Canonicalize fp zero vectors from bitcasted integer zero vectors"

This reverts commit 169db80e41936811c6744f2c513a1ed00d97f10e.

This change is causing many test failures on Windows bots:
- https://lab.llvm.org/buildbot/#/builders/235/builds/3616
- https://lab.llvm.org/buildbot/#/builders/233/builds/4883
- https://lab.llvm.org/buildbot/#/builders/216/builds/31174

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
    llvm/test/CodeGen/X86/2012-07-10-extload64.ll
    llvm/test/CodeGen/X86/fold-load-vec.ll
    llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
    llvm/test/CodeGen/X86/half.ll
    llvm/test/CodeGen/X86/nontemporal-3.ll
    llvm/test/CodeGen/X86/pr13577.ll
    llvm/test/CodeGen/X86/pr41619.ll
    llvm/test/CodeGen/X86/vec_zero_cse.ll
    llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b73779edc5f7270..6167be7bdf84e9f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42930,12 +42930,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Canonicalize fp zero vectors - these sometimes don't fold due to one use
-  // limits.
-  if (VT.isVector() && TLI.isTypeLegal(VT) && ISD::isBuildVectorAllZeros(N) &&
-      (VT.getScalarType() == MVT::f32 || VT.getScalarType() == MVT::f64))
-    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N0));
-
   // Try to remove a bitcast of constant vXi1 vector. We have to legalize
   // most of these to scalar anyway.
   if (Subtarget.hasAVX512() && VT.isScalarInteger() &&

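For context: per the deleted comment above, the reverted combine replaced bitcasts of all-zeros integer vectors with canonical fp zero vectors so that later folds were not blocked by one-use limits. A minimal LLVM IR sketch of the shape it targeted (a hypothetical reduced example, not taken from this patch; in practice the integer zero usually arises during DAG combining rather than appearing literally in IR):

    ; An all-zeros integer vector bitcast to a legal fp vector type. With the
    ; reverted combine, the BITCAST DAG node was rewritten to a canonical fp
    ; zero via getZeroVector before other folds ran.
    define <4 x float> @bitcast_zero_v4f32() {
      %zero = bitcast <4 x i32> zeroinitializer to <4 x float>
      ret <4 x float> %zero
    }
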
diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index d3f410d37567c36..e7f62b9dfc22196 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -49,12 +49,14 @@ entry:
 define void @zero_test() {
 ; X86-LABEL: zero_test:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: zero_test:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movq $0, (%rax)
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movlps %xmm0, (%rax)
 ; X64-NEXT:    retq
 entry:
   %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer

diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
index f4ec8bde3700a9b..b6ec3b34eb1072d 100644
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -29,8 +29,8 @@ define void @store_64(ptr %ptr) {
 ; X86-LABEL: store_64:
 ; X86:       # %bb.0: # %BB
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl $0, 4(%eax)
-; X86-NEXT:    movl $0, (%eax)
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    movlps %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: store_64:

diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index 0bf846a0930bb4c..348929cdf9f79e4 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -10,8 +10,8 @@ define void @sample_test(ptr %source, ptr %dest) nounwind {
 ; CHECK-NEXT:    subq $24, %rsp
 ; CHECK-NEXT:    movq %rdi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movq $0, (%rsp)
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT:    movlps %xmm0, (%rsp)
 ; CHECK-NEXT:    movlps %xmm0, (%rsi)

diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 713ecf5414bac5c..88425ea87845dfe 100644
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -51,6 +51,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X32-NEXT:    mulps %xmm0, %xmm0
+; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
@@ -59,10 +64,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X32-NEXT:    cmpunordps %xmm0, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X32-NEXT:    xorps %xmm0, %xmm0
-; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X32-NEXT:    minps %xmm0, %xmm0
+; X32-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; X32-NEXT:    xorps %xmm0, %xmm0
 ; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -132,6 +135,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X64-NEXT:    mulps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    movaps (%rsp), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
@@ -140,10 +148,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
 ; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
 ; X64-NEXT:    cmpunordps %xmm0, %xmm0
 ; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X64-NEXT:    minps %xmm0, %xmm0
+; X64-NEXT:    minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 ; X64-NEXT:    xorl %ebx, %ebx
 ; X64-NEXT:    xorps %xmm3, %xmm3

diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 7225257203161b2..596e465ee8cacf2 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1082,11 +1082,12 @@ define void @main.158() #0 {
 ; BWON-F16C-LABEL: main.158:
 ; BWON-F16C:       # %bb.0: # %entry
 ; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; BWON-F16C-NEXT:    vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT:    vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BWON-F16C-NEXT:    vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT:    vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT:    jae .LBB20_2
 ; BWON-F16C-NEXT:  # %bb.1: # %entry
 ; BWON-F16C-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1099,7 +1100,8 @@ define void @main.158() #0 {
 ; CHECK-I686-LABEL: main.158:
 ; CHECK-I686:       # %bb.0: # %entry
 ; CHECK-I686-NEXT:    subl $12, %esp
-; CHECK-I686-NEXT:    movl $0, (%esp)
+; CHECK-I686-NEXT:    pxor %xmm0, %xmm0
+; CHECK-I686-NEXT:    movd %xmm0, (%esp)
 ; CHECK-I686-NEXT:    calll __truncsfhf2
 ; CHECK-I686-NEXT:    pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT:    movw %ax, (%esp)

diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll
index f9872b10097a150..a2d2c5ca4301186 100644
--- a/llvm/test/CodeGen/X86/nontemporal-3.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-3.ll
@@ -93,66 +93,247 @@ define void @test_zero_v4f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_zero_v8f32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8f32_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v8f32_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v8f32_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorl %eax, %eax
+; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
+; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v8f32_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v8f32_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v8f32_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    retq
   store <8 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v4i64_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v4i64_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v4i64_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v4i64_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v4i64_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v4i64_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v4i64_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    retq
   store <4 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8i32_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v8i32_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v8i32_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v8i32_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v8i32_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v8i32_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    retq
   store <8 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i16_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16i16_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v16i16_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v16i16_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v16i16_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v16i16_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v16i16_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    retq
   store <16 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v32i8_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v32i8_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v32i8_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v32i8_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v32i8_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v32i8_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v32i8_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    retq
   store <32 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -327,86 +508,347 @@ define void @test_zero_v8f64_align1(ptr %dst) nounwind {
 }
 
 define void @test_zero_v16f32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16f32_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    movntiq %rax, 40(%rdi)
-; CHECK-NEXT:    movntiq %rax, 32(%rdi)
-; CHECK-NEXT:    movntiq %rax, 56(%rdi)
-; CHECK-NEXT:    movntiq %rax, 48(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v16f32_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    movntiq %rax, 40(%rdi)
+; SSE2-NEXT:    movntiq %rax, 32(%rdi)
+; SSE2-NEXT:    movntiq %rax, 56(%rdi)
+; SSE2-NEXT:    movntiq %rax, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v16f32_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorl %eax, %eax
+; SSE4A-NEXT:    movntiq %rax, 8(%rdi)
+; SSE4A-NEXT:    movntiq %rax, 24(%rdi)
+; SSE4A-NEXT:    movntiq %rax, 40(%rdi)
+; SSE4A-NEXT:    movntiq %rax, 56(%rdi)
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v16f32_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    movntiq %rax, 40(%rdi)
+; SSE41-NEXT:    movntiq %rax, 32(%rdi)
+; SSE41-NEXT:    movntiq %rax, 56(%rdi)
+; SSE41-NEXT:    movntiq %rax, 48(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v16f32_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    movntiq %rax, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    movntiq %rax, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v16f32_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    retq
   store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v8i64_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8i64_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    movntiq %rax, 40(%rdi)
-; CHECK-NEXT:    movntiq %rax, 32(%rdi)
-; CHECK-NEXT:    movntiq %rax, 56(%rdi)
-; CHECK-NEXT:    movntiq %rax, 48(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v8i64_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    movntiq %rax, 40(%rdi)
+; SSE2-NEXT:    movntiq %rax, 32(%rdi)
+; SSE2-NEXT:    movntiq %rax, 56(%rdi)
+; SSE2-NEXT:    movntiq %rax, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v8i64_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v8i64_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    movntiq %rax, 40(%rdi)
+; SSE41-NEXT:    movntiq %rax, 32(%rdi)
+; SSE41-NEXT:    movntiq %rax, 56(%rdi)
+; SSE41-NEXT:    movntiq %rax, 48(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v8i64_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    movntiq %rax, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    movntiq %rax, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v8i64_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    retq
   store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v16i32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16i32_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    movntiq %rax, 40(%rdi)
-; CHECK-NEXT:    movntiq %rax, 32(%rdi)
-; CHECK-NEXT:    movntiq %rax, 56(%rdi)
-; CHECK-NEXT:    movntiq %rax, 48(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v16i32_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    movntiq %rax, 40(%rdi)
+; SSE2-NEXT:    movntiq %rax, 32(%rdi)
+; SSE2-NEXT:    movntiq %rax, 56(%rdi)
+; SSE2-NEXT:    movntiq %rax, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v16i32_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v16i32_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    movntiq %rax, 40(%rdi)
+; SSE41-NEXT:    movntiq %rax, 32(%rdi)
+; SSE41-NEXT:    movntiq %rax, 56(%rdi)
+; SSE41-NEXT:    movntiq %rax, 48(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v16i32_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    movntiq %rax, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    movntiq %rax, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v16i32_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    retq
   store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v32i16_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v32i16_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    movntiq %rax, 40(%rdi)
-; CHECK-NEXT:    movntiq %rax, 32(%rdi)
-; CHECK-NEXT:    movntiq %rax, 56(%rdi)
-; CHECK-NEXT:    movntiq %rax, 48(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v32i16_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    movntiq %rax, 40(%rdi)
+; SSE2-NEXT:    movntiq %rax, 32(%rdi)
+; SSE2-NEXT:    movntiq %rax, 56(%rdi)
+; SSE2-NEXT:    movntiq %rax, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v32i16_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v32i16_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    movntiq %rax, 40(%rdi)
+; SSE41-NEXT:    movntiq %rax, 32(%rdi)
+; SSE41-NEXT:    movntiq %rax, 56(%rdi)
+; SSE41-NEXT:    movntiq %rax, 48(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v32i16_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    movntiq %rax, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    movntiq %rax, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v32i16_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    retq
   store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
 
 define void @test_zero_v64i8_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v64i8_align1:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    movntiq %rax, 8(%rdi)
-; CHECK-NEXT:    movntiq %rax, (%rdi)
-; CHECK-NEXT:    movntiq %rax, 24(%rdi)
-; CHECK-NEXT:    movntiq %rax, 16(%rdi)
-; CHECK-NEXT:    movntiq %rax, 40(%rdi)
-; CHECK-NEXT:    movntiq %rax, 32(%rdi)
-; CHECK-NEXT:    movntiq %rax, 56(%rdi)
-; CHECK-NEXT:    movntiq %rax, 48(%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_zero_v64i8_align1:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    movntiq %rax, 8(%rdi)
+; SSE2-NEXT:    movntiq %rax, (%rdi)
+; SSE2-NEXT:    movntiq %rax, 24(%rdi)
+; SSE2-NEXT:    movntiq %rax, 16(%rdi)
+; SSE2-NEXT:    movntiq %rax, 40(%rdi)
+; SSE2-NEXT:    movntiq %rax, 32(%rdi)
+; SSE2-NEXT:    movntiq %rax, 56(%rdi)
+; SSE2-NEXT:    movntiq %rax, 48(%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE4A-LABEL: test_zero_v64i8_align1:
+; SSE4A:       # %bb.0:
+; SSE4A-NEXT:    xorps %xmm0, %xmm0
+; SSE4A-NEXT:    movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, (%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT:    movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT:    retq
+;
+; SSE41-LABEL: test_zero_v64i8_align1:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    movntiq %rax, 8(%rdi)
+; SSE41-NEXT:    movntiq %rax, (%rdi)
+; SSE41-NEXT:    movntiq %rax, 24(%rdi)
+; SSE41-NEXT:    movntiq %rax, 16(%rdi)
+; SSE41-NEXT:    movntiq %rax, 40(%rdi)
+; SSE41-NEXT:    movntiq %rax, 32(%rdi)
+; SSE41-NEXT:    movntiq %rax, 56(%rdi)
+; SSE41-NEXT:    movntiq %rax, 48(%rdi)
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: test_zero_v64i8_align1:
+; AVX:       # %bb.0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    movntiq %rax, 8(%rdi)
+; AVX-NEXT:    movntiq %rax, (%rdi)
+; AVX-NEXT:    movntiq %rax, 24(%rdi)
+; AVX-NEXT:    movntiq %rax, 16(%rdi)
+; AVX-NEXT:    movntiq %rax, 40(%rdi)
+; AVX-NEXT:    movntiq %rax, 32(%rdi)
+; AVX-NEXT:    movntiq %rax, 56(%rdi)
+; AVX-NEXT:    movntiq %rax, 48(%rdi)
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: test_zero_v64i8_align1:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    xorl %eax, %eax
+; AVX512-NEXT:    movntiq %rax, 8(%rdi)
+; AVX512-NEXT:    movntiq %rax, (%rdi)
+; AVX512-NEXT:    movntiq %rax, 24(%rdi)
+; AVX512-NEXT:    movntiq %rax, 16(%rdi)
+; AVX512-NEXT:    movntiq %rax, 40(%rdi)
+; AVX512-NEXT:    movntiq %rax, 32(%rdi)
+; AVX512-NEXT:    movntiq %rax, 56(%rdi)
+; AVX512-NEXT:    movntiq %rax, 48(%rdi)
+; AVX512-NEXT:    retq
   store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
   ret void
 }
@@ -772,7 +1214,3 @@ define void @test_zero_v64i8_align32(ptr %dst) nounwind {
 }
 
 !1 = !{i32 1}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE2: {{.*}}
-; SSE41: {{.*}}
-; SSE4A: {{.*}}

diff --git a/llvm/test/CodeGen/X86/pr13577.ll b/llvm/test/CodeGen/X86/pr13577.ll
index ef359e740c09d7c..7511560d85f5191 100644
--- a/llvm/test/CodeGen/X86/pr13577.ll
+++ b/llvm/test/CodeGen/X86/pr13577.ll
@@ -29,8 +29,7 @@ declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
 define float @pr26070() {
 ; CHECK-LABEL: pr26070:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    xorps %xmm0, %xmm0
-; CHECK-NEXT:    orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    retq
   %c = call float @copysignf(float 1.0, float undef) readnone
   ret float %c

diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
index 88dcd7798f0c3d2..7d1d139a38a520d 100644
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -7,9 +7,10 @@ define void @foo(double %arg) {
 ; CHECK:       ## %bb.0: ## %bb
 ; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    vmovd %eax, %xmm0
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vmovq %xmm0, %rax
 ; CHECK-NEXT:    movl %eax, (%rax)
-; CHECK-NEXT:    movq $0, (%rax)
+; CHECK-NEXT:    vmovlps %xmm1, (%rax)
 ; CHECK-NEXT:    retq
 bb:
   %tmp = bitcast double %arg to i64

diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
index 800dd59c262666d..99185277ba745d5 100644
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -15,8 +15,8 @@ define void @test1() {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movl $0, M1+4
 ; X32-NEXT:    movl $0, M1
-; X32-NEXT:    movl $0, M2+4
-; X32-NEXT:    movl $0, M2
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movlps %xmm0, M2
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test1:

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index eb5fc1523c08a7d..23c37af1db2f7c9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -108,10 +108,11 @@ define void @PR46178(ptr %0) {
 ; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0
 ; X86-NEXT:    vpmovqw %ymm1, %xmm1
-; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    vpsllw $8, %ymm0, %ymm0
-; X86-NEXT:    vpsraw $8, %ymm0, %ymm0
-; X86-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1]
+; X86-NEXT:    vpsllw $8, %xmm0, %xmm0
+; X86-NEXT:    vpsraw $8, %xmm0, %xmm0
+; X86-NEXT:    vpsllw $8, %xmm1, %xmm1
+; X86-NEXT:    vpsraw $8, %xmm1, %xmm1
+; X86-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; X86-NEXT:    vmovdqu %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl


        

