[llvm] r272651 - [X86][SSE4A] Added patterns for nontemporal stores of scalar float/doubles using MOVNTSD/MOVNTSS
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 14 02:43:38 PDT 2016
Author: rksimon
Date: Tue Jun 14 04:43:38 2016
New Revision: 272651
URL: http://llvm.org/viewvc/llvm-project?rev=272651&view=rev
Log:
[X86][SSE4A] Added patterns for nontemporal stores of scalar float/doubles using MOVNTSD/MOVNTSS
Modified:
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=272651&r1=272650&r2=272651&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Tue Jun 14 04:43:38 2016
@@ -7774,6 +7774,8 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs
VR128:$mask))]>, XD;
}
+// Non-temporal (unaligned) scalar stores.
+let AddedComplexity = 400 in { // Prefer non-temporal versions
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
"movntss\t{$src, $dst|$dst, $src}",
[(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
@@ -7781,7 +7783,15 @@ def MOVNTSS : I<0x2B, MRMDestMem, (outs)
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movntsd\t{$src, $dst|$dst, $src}",
[(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
-}
+
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+ (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+ (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+} // AddedComplexity
+} // HasSSE4A
//===----------------------------------------------------------------------===//
// AVX Instructions
@@ -8364,7 +8374,7 @@ let Predicates = [HasAVX2, NoVLX] in {
(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
+ }
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
Modified: llvm/trunk/test/CodeGen/X86/nontemporal-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/nontemporal-2.ll?rev=272651&r1=272650&r2=272651&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/nontemporal-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/nontemporal-2.ll Tue Jun 14 04:43:38 2016
@@ -386,10 +386,20 @@ define void @test_zero_v32i8(<32 x i8>*
; Scalar versions.
define void @test_arg_f32(float %arg, float* %dst) {
-; SSE-LABEL: test_arg_f32:
-; SSE: # BB#0:
-; SSE-NEXT: movss %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_arg_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f32:
; AVX: # BB#0:
@@ -424,10 +434,20 @@ define void @test_arg_i32(i32 %arg, i32*
}
define void @test_arg_f64(double %arg, double* %dst) {
-; SSE-LABEL: test_arg_f64:
-; SSE: # BB#0:
-; SSE-NEXT: movsd %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_arg_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f64:
; AVX: # BB#0:
@@ -473,7 +493,7 @@ define void @test_extract_f32(<4 x float
; SSE4A-LABEL: test_extract_f32:
; SSE4A: # BB#0:
; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE4A-NEXT: movss %xmm0, (%rdi)
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f32:
@@ -536,10 +556,21 @@ define void @test_extract_i32(<4 x i32>
}
define void @test_extract_f64(<2 x double> %arg, double* %dst) {
-; SSE-LABEL: test_extract_f64:
-; SSE: # BB#0:
-; SSE-NEXT: movhpd %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_extract_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movhpd %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f64:
; AVX: # BB#0:
More information about the llvm-commits mailing list