[PATCH][X86] Fix missed selection of non-temporal store of zero vector (PR19370).

Andrea Di Biagio andrea.dibiagio at gmail.com
Thu Oct 16 10:41:29 PDT 2014


Hi,

This is a fix for PR19370.
Currently the x86 backend wrongly selects a normal vector store
instead of a non-temporal store if the value to store is a vector of
all zeros.

Small reproducer:

;;
define void @test(<4 x float>* %dst) {
  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16,
!nontemporal !1
  ret void
}

!1 = metadata !{i32 1}
;;

llc (-mtriple=x86_64-unknown-unknown -mcpu=corei7-avx) generates:
  vxorps  %xmm0, %xmm0, %xmm0
  vmovaps %xmm0, (%rdi)
  retq

Instead, llc should generate:
  vxorps  %xmm0, %xmm0, %xmm0
  vmovntps  %xmm0, (%rdi)
  retq

In this example, the vector of all zeros is legalized to a zero vector
of type v4i32.
However, ISel doesn't have a rule to select a MOVNTPSmr when the
source is not a vector of float elements. So, it eventually (wrongly)
falls back to selecting a normal store.

This patch fixes the problem by adding extra ISel patterns to cover that
particular corner case.

Please let me know if ok to commit.

Thanks!
Andrea
-------------- next part --------------
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td	(revision 219931)
+++ lib/Target/X86/X86InstrSSE.td	(working copy)
@@ -3939,6 +3939,14 @@
                   PS, Requires<[HasSSE2]>;
 } // SchedRW = [WriteStore]
 
+let Predicates = [HasAVX, NoVLX] in {
+  def : Pat<(alignednontemporalstore (v4i32 immAllZerosV), addr:$dst),
+            (VMOVNTPSmr addr:$dst, (v4i32 (V_SET0)))>;
+}
+
+def : Pat<(alignednontemporalstore (v4i32 immAllZerosV), addr:$dst),
+          (MOVNTPSmr addr:$dst, (v4i32 (V_SET0)))>;
+
 } // AddedComplexity
 
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/X86/nontemporal-2.ll
===================================================================
--- test/CodeGen/X86/nontemporal-2.ll	(revision 0)
+++ test/CodeGen/X86/nontemporal-2.ll	(working copy)
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Make sure that we generate non-temporal stores for the test cases below.
+
+define void @test1(<4 x float>* %dst) {
+; CHECK-LABEL: test1:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test2(<4 x i32>* %dst) {
+; CHECK-LABEL: test2:
+; SSE: movntps
+; AVX: vmovntps
+  store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+define void @test3(<2 x double>* %dst) {
+; CHECK-LABEL: test3:
+; SSE: movntps
+; AVX: vmovntps
+  store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
+  ret void
+}
+
+!1 = metadata !{i32 1}


More information about the llvm-commits mailing list