[llvm] r318016 - [X86] Fix SQRTSS/SQRTSD/RCPSS/RCPSD intrinsics to use sse_load_f32/sse_load_f64 to increase load folding opportunities.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 12 21:25:25 PST 2017
Author: ctopper
Date: Sun Nov 12 21:25:24 2017
New Revision: 318016
URL: http://llvm.org/viewvc/llvm-project?rev=318016&view=rev
Log:
[X86] Fix SQRTSS/SQRTSD/RCPSS/RCPSD intrinsics to use sse_load_f32/sse_load_f64 to increase load folding opportunities.
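Previously the *m_Int memory patterns for these scalar intrinsics only matched
a scalar load widened with scalar_to_vector, so a full-vector load feeding one
of the intrinsics had to go through a separate move instruction. The
sse_load_f32/sse_load_f64 ComplexPatterns also match vector loads whose upper
elements the scalar operation ignores, which lets isel fold such loads
directly into the instruction. A minimal sketch in LLVM IR, mirroring the
updated fold-load-unops.ll test below (the function name here is illustrative,
not from the test):

declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>)

; With this change the full-vector load folds into the instruction under
; optsize, giving "rcpss (%rdi), %xmm0" instead of a movaps followed by rcpss.
define <4 x float> @rcpss_fold_sketch(<4 x float>* %p) optsize {
  %v = load <4 x float>, <4 x float>* %p
  %r = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %v)
  ret <4 x float> %r
}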
Modified:
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/fold-load-unops.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=318016&r1=318015&r2=318016&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Sun Nov 12 21:25:24 2017
@@ -7588,11 +7588,10 @@ multiclass avx512_sqrt_scalar<bits<8> op
(_.VT _.RC:$src2),
(i32 FROUND_CURRENT))>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrtRnds (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
+ _.ScalarIntMemCPat:$src2,
(i32 FROUND_CURRENT))>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -7630,7 +7629,7 @@ let Predicates = [HasAVX512, OptForSize]
(!cast<Instruction>(NAME#SUFF#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(Intr (scalar_to_vector (_.EltVT (load addr:$src2)))),
+ def : Pat<(Intr _.ScalarIntMemCPat:$src2),
(!cast<Instruction>(NAME#SUFF#Zm_Int)
(_.VT (IMPLICIT_DEF)), addr:$src2)>;
}
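(In the AVX512 multiclasses, _ is an X86VectorVTInfo instance;
_.ScalarIntMemCPat should resolve to the same sse_load_f32/sse_load_f64
ComplexPatterns according to the element type, so the EVEX-encoded scalar
sqrt patterns pick up the same load-folding behavior as the SSE/AVX forms
changed below.)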
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=318016&r1=318015&r2=318016&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Sun Nov 12 21:25:24 2017
@@ -3040,6 +3040,7 @@ def SSE_RCPS : OpndItins<
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
+ Operand intmemop, ComplexPattern int_cpat,
Intrinsic Intr,
SDNode OpNode, Domain d, OpndItins itins,
Predicate target, string Suffix> {
@@ -3060,7 +3061,7 @@ multiclass sse_fp_unop_s<bits<8> opc, st
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let mayLoad = 1 in
- def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3080,7 +3081,7 @@ multiclass sse_fp_unop_s<bits<8> opc, st
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ def : Pat<(Intr int_cpat:$src2),
(!cast<Instruction>(NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
@@ -3089,6 +3090,7 @@ multiclass sse_fp_unop_s<bits<8> opc, st
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
+ Operand intmemop, ComplexPattern int_cpat,
Intrinsic Intr, SDNode OpNode, Domain d,
OpndItins itins, Predicate target, string Suffix> {
let hasSideEffects = 0 in {
@@ -3106,7 +3108,7 @@ multiclass avx_fp_unop_s<bits<8> opc, st
[]>, Sched<[itins.Sched.Folded]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src2),
+ (ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3129,7 +3131,7 @@ multiclass avx_fp_unop_s<bits<8> opc, st
VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ def : Pat<(Intr int_cpat:$src2),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
def : Pat<(ScalarVT (OpNode (load addr:$src))),
@@ -3213,10 +3215,11 @@ let Predicates = [HasAVX, NoVLX] in {
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+ ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem,
+ f32mem, ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
VEX_LIG, VEX_WIG, NotMemoryFoldable;
@@ -3225,10 +3228,11 @@ multiclass sse1_fp_unop_s<bits<8> opc, s
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+ sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem,
+ f64mem, sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
Modified: llvm/trunk/test/CodeGen/X86/fold-load-unops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fold-load-unops.ll?rev=318016&r1=318015&r2=318016&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fold-load-unops.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fold-load-unops.ll Sun Nov 12 21:25:24 2017
@@ -101,14 +101,12 @@ define float @rcpss_size(float* %a) opts
define <4 x float> @rcpss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rcpss_full_size:
; SSE: # BB#0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: rcpss %xmm0, %xmm0
+; SSE-NEXT: rcpss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rcpss_full_size:
; AVX: # BB#0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vrcpss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <4 x float>, <4 x float>* %a
%res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld)
@@ -135,14 +133,12 @@ define float @rsqrtss_size(float* %a) op
define <4 x float> @rsqrtss_full_size(<4 x float>* %a) optsize {
; SSE-LABEL: rsqrtss_full_size:
; SSE: # BB#0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: rsqrtss %xmm0, %xmm0
+; SSE-NEXT: rsqrtss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: rsqrtss_full_size:
; AVX: # BB#0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <4 x float>, <4 x float>* %a
%res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld)
@@ -169,14 +165,12 @@ define float @sqrtss_size(float* %a) opt
define <4 x float> @sqrtss_full_size(<4 x float>* %a) optsize{
; SSE-LABEL: sqrtss_full_size:
; SSE: # BB#0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: sqrtss (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtss_full_size:
; AVX: # BB#0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <4 x float>, <4 x float>* %a
%res = tail call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %ld)
@@ -203,14 +197,12 @@ define double @sqrtsd_size(double* %a) o
define <2 x double> @sqrtsd_full_size(<2 x double>* %a) optsize {
; SSE-LABEL: sqrtsd_full_size:
; SSE: # BB#0:
-; SSE-NEXT: movapd (%rdi), %xmm0
-; SSE-NEXT: sqrtsd %xmm0, %xmm0
+; SSE-NEXT: sqrtsd (%rdi), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtsd_full_size:
; AVX: # BB#0:
-; AVX-NEXT: vmovapd (%rdi), %xmm0
-; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%ld = load <2 x double>, <2 x double>* %a
%res = tail call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %ld)