[PATCH][X86] Fix missed selection of non-temporal store of zero vector (PR19370).
Andrea Di Biagio
andrea.dibiagio at gmail.com
Thu Oct 16 10:41:29 PDT 2014
Hi,
This is a fix for PR19370.
Currently the x86 backend wrongly selects a normal vector store
instead of a non-temporal store if the value to store is a vector of
all zeros.
Small reproducer:
;;
define void @test(<4 x float>* %dst) {
store <4 x float> zeroinitializer, <4 x float>* %dst, align 16,
!nontemporal !1
ret void
}
!1 = metadata !{i32 1}
;;
llc (-mtriple=x86_64-unknown-unknown -mcpu=corei7-avx) generates:
vxorps %xmm0, %xmm0, %xmm0
vmovaps %xmm0, (%rdi)
retq
Instead, llc should generate:
vxorps %xmm0, %xmm0, %xmm0
vmovntps %xmm0, (%rdi)
retq
In this example, the vector of all zeros is legalized to a zero vector
of type v4i32.
However, ISel doesn't have a rule to select a MOVNTPSmr when the
source is not a vector of float elements. So, it eventually (wrongly)
falls back to selecting a normal store.
This patch fixes the problem by adding extra ISel patterns to cover that
particular corner case.
Please let me know if this is OK to commit.
Thanks!
Andrea
-------------- next part --------------
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td (revision 219931)
+++ lib/Target/X86/X86InstrSSE.td (working copy)
@@ -3939,6 +3939,14 @@
PS, Requires<[HasSSE2]>;
} // SchedRW = [WriteStore]
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(alignednontemporalstore (v4i32 immAllZerosV), addr:$dst),
+ (VMOVNTPSmr addr:$dst, (v4i32 (V_SET0)))>;
+}
+
+def : Pat<(alignednontemporalstore (v4i32 immAllZerosV), addr:$dst),
+ (MOVNTPSmr addr:$dst, (v4i32 (V_SET0)))>;
+
} // AddedComplexity
//===----------------------------------------------------------------------===//
Index: test/CodeGen/X86/nontemporal-2.ll
===================================================================
--- test/CodeGen/X86/nontemporal-2.ll (revision 0)
+++ test/CodeGen/X86/nontemporal-2.ll (working copy)
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck -check-prefix=CHECK -check-prefix=AVX
+
+
+; Make sure that we generate non-temporal stores for the test cases below.
+
+define void @test1(<4 x float>* %dst) {
+; CHECK-LABEL: test1:
+; SSE: movntps
+; AVX: vmovntps
+ store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test2(<4 x i32>* %dst) {
+; CHECK-LABEL: test2:
+; SSE: movntps
+; AVX: vmovntps
+ store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+define void @test3(<2 x double>* %dst) {
+; CHECK-LABEL: test3:
+; SSE: movntps
+; AVX: vmovntps
+ store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
+ ret void
+}
+
+!1 = metadata !{i32 1}
More information about the llvm-commits
mailing list