[llvm] c12de14 - Revert "[X86] Canonicalize fp zero vectors from bitcasted integer zero vectors"
Douglas Yung via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 30 12:00:33 PST 2023
Author: Douglas Yung
Date: 2023-11-30T11:59:50-08:00
New Revision: c12de1487670ab009e738b905ad8296a8cb2c685
URL: https://github.com/llvm/llvm-project/commit/c12de1487670ab009e738b905ad8296a8cb2c685
DIFF: https://github.com/llvm/llvm-project/commit/c12de1487670ab009e738b905ad8296a8cb2c685.diff
LOG: Revert "[X86] Canonicalize fp zero vectors from bitcasted integer zero vectors"
This reverts commit 169db80e41936811c6744f2c513a1ed00d97f10e.
This change is causing many test failures on Windows bots:
- https://lab.llvm.org/buildbot/#/builders/235/builds/3616
- https://lab.llvm.org/buildbot/#/builders/233/builds/4883
- https://lab.llvm.org/buildbot/#/builders/216/builds/31174
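For reference, the reverted combine folded bitcasts of all-zero integer vectors directly into fp zero vectors during DAG combine, which changed how zero-vector stores were materialized in several of the tests updated below. A minimal IR-level sketch of the kind of pattern affected (a hypothetical reduced case, not taken from the failing tests):

; Sketch only: with the reverted combine in place, a store of an fp zero vector
; tended to lower to an xorps-zeroed XMM register plus movlps, instead of the
; scalar immediate stores (movl $0 / movq $0) restored by this revert.
define void @store_zero_v2f32(ptr %dst) {
  store <2 x float> zeroinitializer, ptr %dst
  ret void
}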
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
llvm/test/CodeGen/X86/2012-07-10-extload64.ll
llvm/test/CodeGen/X86/fold-load-vec.ll
llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
llvm/test/CodeGen/X86/half.ll
llvm/test/CodeGen/X86/nontemporal-3.ll
llvm/test/CodeGen/X86/pr13577.ll
llvm/test/CodeGen/X86/pr41619.ll
llvm/test/CodeGen/X86/vec_zero_cse.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b73779edc5f7270..6167be7bdf84e9f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42930,12 +42930,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
}
}
- // Canonicalize fp zero vectors - these sometimes don't fold due to one use
- // limits.
- if (VT.isVector() && TLI.isTypeLegal(VT) && ISD::isBuildVectorAllZeros(N) &&
- (VT.getScalarType() == MVT::f32 || VT.getScalarType() == MVT::f64))
- return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N0));
-
// Try to remove a bitcast of constant vXi1 vector. We have to legalize
// most of these to scalar anyway.
if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index d3f410d37567c36..e7f62b9dfc22196 100644
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -49,12 +49,14 @@ entry:
define void @zero_test() {
; X86-LABEL: zero_test:
; X86: # %bb.0: # %entry
-; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: zero_test:
; X64: # %bb.0: # %entry
-; X64-NEXT: movq $0, (%rax)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movlps %xmm0, (%rax)
; X64-NEXT: retq
entry:
%0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
index f4ec8bde3700a9b..b6ec3b34eb1072d 100644
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -29,8 +29,8 @@ define void @store_64(ptr %ptr) {
; X86-LABEL: store_64:
; X86: # %bb.0: # %BB
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl $0, 4(%eax)
-; X86-NEXT: movl $0, (%eax)
+; X86-NEXT: xorps %xmm0, %xmm0
+; X86-NEXT: movlps %xmm0, (%eax)
; X86-NEXT: retl
;
; X64-LABEL: store_64:
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
index 0bf846a0930bb4c..348929cdf9f79e4 100644
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -10,8 +10,8 @@ define void @sample_test(ptr %source, ptr %dest) nounwind {
; CHECK-NEXT: subq $24, %rsp
; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movlps %xmm0, (%rsp)
; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; CHECK-NEXT: movlps %xmm0, (%rsp)
; CHECK-NEXT: movlps %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
index 713ecf5414bac5c..88425ea87845dfe 100644
--- a/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
+++ b/llvm/test/CodeGen/X86/fold-pcmpeqd-2.ll
@@ -51,6 +51,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; X32-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X32-NEXT: mulps %xmm0, %xmm0
+; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; X32-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
@@ -59,10 +64,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; X32-NEXT: cmpunordps %xmm0, %xmm0
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; X32-NEXT: movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X32-NEXT: minps %xmm0, %xmm0
+; X32-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X32-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; X32-NEXT: xorps %xmm0, %xmm0
; X32-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
@@ -132,6 +135,11 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
+; X64-NEXT: mulps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; X64-NEXT: movaps (%rsp), %xmm0 ## 16-byte Reload
; X64-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
@@ -140,10 +148,8 @@ define void @program_1(ptr %dest, ptr %t0, <4 x float> %p0, <4 x float> %p1, <4
; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
; X64-NEXT: cmpunordps %xmm0, %xmm0
; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
-; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; X64-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 16-byte Reload
-; X64-NEXT: minps %xmm0, %xmm0
+; X64-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
; X64-NEXT: xorl %ebx, %ebx
; X64-NEXT: xorps %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 7225257203161b2..596e465ee8cacf2 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1082,11 +1082,12 @@ define void @main.158() #0 {
; BWON-F16C-LABEL: main.158:
; BWON-F16C: # %bb.0: # %entry
; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
+; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
; BWON-F16C-NEXT: jae .LBB20_2
; BWON-F16C-NEXT: # %bb.1: # %entry
; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
@@ -1099,7 +1100,8 @@ define void @main.158() #0 {
; CHECK-I686-LABEL: main.158:
; CHECK-I686: # %bb.0: # %entry
; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movl $0, (%esp)
+; CHECK-I686-NEXT: pxor %xmm0, %xmm0
+; CHECK-I686-NEXT: movd %xmm0, (%esp)
; CHECK-I686-NEXT: calll __truncsfhf2
; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
; CHECK-I686-NEXT: movw %ax, (%esp)
diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll
index f9872b10097a150..a2d2c5ca4301186 100644
--- a/llvm/test/CodeGen/X86/nontemporal-3.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-3.ll
@@ -93,66 +93,247 @@ define void @test_zero_v4f64_align1(ptr %dst) nounwind {
}
define void @test_zero_v8f32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8f32_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v8f32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8f32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: movntiq %rax, 24(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8f32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8f32_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v8f32_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: retq
store <8 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v4i64_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v4i64_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v4i64_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v4i64_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v4i64_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v4i64_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v4i64_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: retq
store <4 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v8i32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8i32_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v8i32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8i32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8i32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i32_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v8i32_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: retq
store <8 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v16i16_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16i16_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v16i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i16_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v16i16_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: retq
store <16 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v32i8_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v32i8_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v32i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v32i8_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v32i8_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: retq
store <32 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
@@ -327,86 +508,347 @@ define void @test_zero_v8f64_align1(ptr %dst) nounwind {
}
define void @test_zero_v16f32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16f32_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: movntiq %rax, 40(%rdi)
-; CHECK-NEXT: movntiq %rax, 32(%rdi)
-; CHECK-NEXT: movntiq %rax, 56(%rdi)
-; CHECK-NEXT: movntiq %rax, 48(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v16f32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16f32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorl %eax, %eax
+; SSE4A-NEXT: movntiq %rax, 8(%rdi)
+; SSE4A-NEXT: movntiq %rax, 24(%rdi)
+; SSE4A-NEXT: movntiq %rax, 40(%rdi)
+; SSE4A-NEXT: movntiq %rax, 56(%rdi)
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16f32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16f32_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v16f32_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: retq
store <16 x float> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v8i64_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v8i64_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: movntiq %rax, 40(%rdi)
-; CHECK-NEXT: movntiq %rax, 32(%rdi)
-; CHECK-NEXT: movntiq %rax, 56(%rdi)
-; CHECK-NEXT: movntiq %rax, 48(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v8i64_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v8i64_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v8i64_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v8i64_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v8i64_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: retq
store <8 x i64> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v16i32_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v16i32_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: movntiq %rax, 40(%rdi)
-; CHECK-NEXT: movntiq %rax, 32(%rdi)
-; CHECK-NEXT: movntiq %rax, 56(%rdi)
-; CHECK-NEXT: movntiq %rax, 48(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v16i32_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v16i32_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v16i32_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v16i32_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v16i32_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: retq
store <16 x i32> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v32i16_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v32i16_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: movntiq %rax, 40(%rdi)
-; CHECK-NEXT: movntiq %rax, 32(%rdi)
-; CHECK-NEXT: movntiq %rax, 56(%rdi)
-; CHECK-NEXT: movntiq %rax, 48(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v32i16_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v32i16_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v32i16_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v32i16_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v32i16_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: retq
store <32 x i16> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
define void @test_zero_v64i8_align1(ptr %dst) nounwind {
-; CHECK-LABEL: test_zero_v64i8_align1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movntiq %rax, 8(%rdi)
-; CHECK-NEXT: movntiq %rax, (%rdi)
-; CHECK-NEXT: movntiq %rax, 24(%rdi)
-; CHECK-NEXT: movntiq %rax, 16(%rdi)
-; CHECK-NEXT: movntiq %rax, 40(%rdi)
-; CHECK-NEXT: movntiq %rax, 32(%rdi)
-; CHECK-NEXT: movntiq %rax, 56(%rdi)
-; CHECK-NEXT: movntiq %rax, 48(%rdi)
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_zero_v64i8_align1:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movntiq %rax, 8(%rdi)
+; SSE2-NEXT: movntiq %rax, (%rdi)
+; SSE2-NEXT: movntiq %rax, 24(%rdi)
+; SSE2-NEXT: movntiq %rax, 16(%rdi)
+; SSE2-NEXT: movntiq %rax, 40(%rdi)
+; SSE2-NEXT: movntiq %rax, 32(%rdi)
+; SSE2-NEXT: movntiq %rax, 56(%rdi)
+; SSE2-NEXT: movntiq %rax, 48(%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_zero_v64i8_align1:
+; SSE4A: # %bb.0:
+; SSE4A-NEXT: xorps %xmm0, %xmm0
+; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
+; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_zero_v64i8_align1:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movntiq %rax, 8(%rdi)
+; SSE41-NEXT: movntiq %rax, (%rdi)
+; SSE41-NEXT: movntiq %rax, 24(%rdi)
+; SSE41-NEXT: movntiq %rax, 16(%rdi)
+; SSE41-NEXT: movntiq %rax, 40(%rdi)
+; SSE41-NEXT: movntiq %rax, 32(%rdi)
+; SSE41-NEXT: movntiq %rax, 56(%rdi)
+; SSE41-NEXT: movntiq %rax, 48(%rdi)
+; SSE41-NEXT: retq
+;
+; AVX-LABEL: test_zero_v64i8_align1:
+; AVX: # %bb.0:
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movntiq %rax, 8(%rdi)
+; AVX-NEXT: movntiq %rax, (%rdi)
+; AVX-NEXT: movntiq %rax, 24(%rdi)
+; AVX-NEXT: movntiq %rax, 16(%rdi)
+; AVX-NEXT: movntiq %rax, 40(%rdi)
+; AVX-NEXT: movntiq %rax, 32(%rdi)
+; AVX-NEXT: movntiq %rax, 56(%rdi)
+; AVX-NEXT: movntiq %rax, 48(%rdi)
+; AVX-NEXT: retq
+;
+; AVX512-LABEL: test_zero_v64i8_align1:
+; AVX512: # %bb.0:
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movntiq %rax, 8(%rdi)
+; AVX512-NEXT: movntiq %rax, (%rdi)
+; AVX512-NEXT: movntiq %rax, 24(%rdi)
+; AVX512-NEXT: movntiq %rax, 16(%rdi)
+; AVX512-NEXT: movntiq %rax, 40(%rdi)
+; AVX512-NEXT: movntiq %rax, 32(%rdi)
+; AVX512-NEXT: movntiq %rax, 56(%rdi)
+; AVX512-NEXT: movntiq %rax, 48(%rdi)
+; AVX512-NEXT: retq
store <64 x i8> zeroinitializer, ptr %dst, align 1, !nontemporal !1
ret void
}
@@ -772,7 +1214,3 @@ define void @test_zero_v64i8_align32(ptr %dst) nounwind {
}
!1 = !{i32 1}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; SSE2: {{.*}}
-; SSE41: {{.*}}
-; SSE4A: {{.*}}
diff --git a/llvm/test/CodeGen/X86/pr13577.ll b/llvm/test/CodeGen/X86/pr13577.ll
index ef359e740c09d7c..7511560d85f5191 100644
--- a/llvm/test/CodeGen/X86/pr13577.ll
+++ b/llvm/test/CodeGen/X86/pr13577.ll
@@ -29,8 +29,7 @@ declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone
define float @pr26070() {
; CHECK-LABEL: pr26070:
; CHECK: ## %bb.0:
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT: retq
%c = call float @copysignf(float 1.0, float undef) readnone
ret float %c
diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
index 88dcd7798f0c3d2..7d1d139a38a520d 100644
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -7,9 +7,10 @@ define void @foo(double %arg) {
; CHECK: ## %bb.0: ## %bb
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: vmovd %eax, %xmm0
+; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vmovq %xmm0, %rax
; CHECK-NEXT: movl %eax, (%rax)
-; CHECK-NEXT: movq $0, (%rax)
+; CHECK-NEXT: vmovlps %xmm1, (%rax)
; CHECK-NEXT: retq
bb:
%tmp = bitcast double %arg to i64
diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
index 800dd59c262666d..99185277ba745d5 100644
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -15,8 +15,8 @@ define void @test1() {
; X32: # %bb.0:
; X32-NEXT: movl $0, M1+4
; X32-NEXT: movl $0, M1
-; X32-NEXT: movl $0, M2+4
-; X32-NEXT: movl $0, M2
+; X32-NEXT: xorps %xmm0, %xmm0
+; X32-NEXT: movlps %xmm0, M2
; X32-NEXT: retl
;
; X64-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index eb5fc1523c08a7d..23c37af1db2f7c9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -108,10 +108,11 @@ define void @PR46178(ptr %0) {
; X86-NEXT: vmovdqu (%eax), %ymm1
; X86-NEXT: vpmovqw %ymm0, %xmm0
; X86-NEXT: vpmovqw %ymm1, %xmm1
-; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT: vpsllw $8, %ymm0, %ymm0
-; X86-NEXT: vpsraw $8, %ymm0, %ymm0
-; X86-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,1]
+; X86-NEXT: vpsllw $8, %xmm0, %xmm0
+; X86-NEXT: vpsraw $8, %xmm0, %xmm0
+; X86-NEXT: vpsllw $8, %xmm1, %xmm1
+; X86-NEXT: vpsraw $8, %xmm1, %xmm1
+; X86-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; X86-NEXT: vmovdqu %ymm0, (%eax)
; X86-NEXT: vzeroupper
; X86-NEXT: retl