[llvm] r318022 - [X86] Use sse_load_f32/f64 to improve load folding of scalar vscalefss/sd, vrcp14ss/sd, vrsqrt14ss/sd instructions.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 13 00:07:33 PST 2017


Author: ctopper
Date: Mon Nov 13 00:07:33 2017
New Revision: 318022

URL: http://llvm.org/viewvc/llvm-project?rev=318022&view=rev
Log:
[X86] Use sse_load_f32/f64 to improve load folding of scalar vscalefss/sd, vrcp14ss/sd, vrsqrt14ss/sd instructions.

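Some background on the mechanism (an illustrative sketch, not part of the
original log): sse_load_f32/f64 are ComplexPatterns that match both a scalar
load wrapped in scalar_to_vector and a load of a full 128-bit vector when
only the low element is consumed. The old patterns below handled only the
first shape; the second is what the new tests in this commit exercise. A
minimal IR sketch of the two shapes, reusing the vrsqrt14ss intrinsic from
the tests (function names here are illustrative):

declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8)

; Shape 1: a true scalar load. This already folded before the patch via the
; scalar_to_vector (ScalarLdFrag addr:$src2) pattern, and sse_load_f32
; continues to cover it.
define <4 x float> @fold_scalar_shape(<4 x float> %a0, float* %p) {
  %s = load float, float* %p
  %v = insertelement <4 x float> undef, float %s, i32 0
  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %v, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}

; Shape 2: a full-vector load whose upper elements the scalar instruction
; ignores. Before this patch this selected roughly
;   vmovaps (%rdi), %xmm1
;   vrsqrt14ss %xmm1, %xmm0, %xmm0
; with sse_load_f32 it should instead fold to
;   vrsqrt14ss (%rdi), %xmm0, %xmm0
define <4 x float> @fold_vector_shape(<4 x float> %a0, <4 x float>* %p) {
  %v = load <4 x float>, <4 x float>* %p
  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %v, <4 x float> zeroinitializer, i8 -1)
  ret <4 x float> %res
}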
Modified:
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-scalarIntrinsics.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=318022&r1=318021&r2=318022&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Nov 13 00:07:33 2017
@@ -4942,10 +4942,9 @@ multiclass avx512_fp_scalef_scalar<bits<
                   "$src2, $src1", "$src1, $src2",
                   (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
   defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                  (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+                  (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
                   "$src2, $src1", "$src1, $src2",
-                  (OpNode _.RC:$src1,
-                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                  (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
                           (i32 FROUND_CURRENT))>;
   }
 }
@@ -7356,10 +7355,10 @@ multiclass avx512_fp14_s<bits<8> opc, st
                            "$src2, $src1", "$src1, $src2",
                            (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
   defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (OpNode (_.VT _.RC:$src1),
-                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
+                          _.ScalarIntMemCPat:$src2)>, EVEX_4V;
 }
 }
 
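In both hunks the change is the same: _.IntScalarMemOp (the ssmem/sdmem
intrinsic memory operand) replaces _.ScalarMemOp, and _.ScalarIntMemCPat,
which resolves to sse_load_f32 or sse_load_f64 according to the element
type, replaces the explicit scalar_to_vector/ScalarLdFrag pattern. One
caveat for the scalef pattern: it remains guarded by (i32 FROUND_CURRENT),
so only intrinsic calls whose rounding argument is 4 (current direction)
can select the memory form. A sketch under that assumption (names
illustrative, not from the patch):

declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)

; The rounding argument is 4 (FROUND_CURRENT), so the rm pattern applies
; and the scalar load should fold into
;   vscalefss (%rdi), %xmm0, %xmm0
; A call with an explicit embedded-rounding value would take the register
; form instead.
define <4 x float> @scalef_ss_cur_direction(<4 x float> %x0, float* %p) {
  %s = load float, float* %p
  %v = insertelement <4 x float> undef, float %s, i32 0
  %r = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %v, <4 x float> undef, i8 -1, i32 4)
  ret <4 x float> %r
}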

Modified: llvm/trunk/test/CodeGen/X86/avx512-scalarIntrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-scalarIntrinsics.ll?rev=318022&r1=318021&r2=318022&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-scalarIntrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-scalarIntrinsics.ll Mon Nov 13 00:07:33 2017
@@ -11,6 +11,16 @@ define <4 x float> @test_rsqrt14_ss(<4 x
     %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
     ret <4 x float> %res
 }
+
+define <4 x float> @test_rsqrt14_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt14_ss_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vrsqrt14ss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = load <4 x float>, <4 x float>* %a1ptr
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1) ;
+  ret <4 x float> %res
+}
 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
@@ -21,6 +31,16 @@ define <4 x float> @test_rcp14_ss(<4 x f
     %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ;
     ret <4 x float> %res
 }
+
+define <4 x float> @test_rcp14_ss_load(<4 x float> %a0, <4 x float>* %a1ptr) {
+; CHECK-LABEL: test_rcp14_ss_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vrcp14ss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = load <4 x float>, <4 x float>* %a1ptr
+  %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1) ;
+  ret <4 x float> %res
+}
 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
 define <2 x double> @test_rsqrt14_sd(<2 x double> %a0) {
@@ -31,6 +51,16 @@ define <2 x double> @test_rsqrt14_sd(<2
     %res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 -1) ;
     ret <2 x double> %res
 }
+
+define <2 x double> @test_rsqrt14_sd_load(<2 x double> %a0, <2 x double>* %a1ptr) {
+; CHECK-LABEL: test_rsqrt14_sd_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vrsqrt14sd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = load <2 x double>, <2 x double>* %a1ptr
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1) ;
+  ret <2 x double> %res
+}
 declare <2 x double> @llvm.x86.avx512.rsqrt14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
 
 define <2 x double> @test_rcp14_sd(<2 x double> %a0) {
@@ -42,6 +72,16 @@ define <2 x double> @test_rcp14_sd(<2 x
     ret <2 x double> %res
 
 }
+
+define <2 x double> @test_rcp14_sd_load(<2 x double> %a0, <2 x double>* %a1ptr) {
+; CHECK-LABEL: test_rcp14_sd_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vrcp14sd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = load <2 x double>, <2 x double>* %a1ptr
+  %res = call <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1) ;
+  ret <2 x double> %res
+}
 declare <2 x double> @llvm.x86.avx512.rcp14.sd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
 
 declare <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float>, <4 x float>,<4 x float>, i8, i32)
@@ -67,6 +107,16 @@ define <4 x float>@test_int_x86_avx512_m
     ret <4 x float> %res2
 }
 
+define <4 x float>@test_int_x86_avx512_mask_scalef_ss_load(<4 x float> %x0, <4 x float>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_ss_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vscalefss (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %x1 = load <4 x float>, <4 x float>* %x1ptr
+  %res = call <4 x float> @llvm.x86.avx512.mask.scalef.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> undef, i8 -1, i32 4)
+  ret <4 x float> %res
+}
+
 declare <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double>, <2 x double>,<2 x double>, i8, i32)
 define <2 x double>@test_int_x86_avx512_mask_scalef_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
 ; SKX-LABEL: test_int_x86_avx512_mask_scalef_sd:
@@ -89,3 +139,13 @@ define <2 x double>@test_int_x86_avx512_
     %res2 = fadd <2 x double> %res, %res1
     ret <2 x double> %res2
 }
+
+define <2 x double>@test_int_x86_avx512_mask_scalef_sd_load(<2 x double> %x0, <2 x double>* %x1ptr) {
+; CHECK-LABEL: test_int_x86_avx512_mask_scalef_sd_load:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vscalefsd (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %x1 = load <2 x double>, <2 x double>* %x1ptr
+  %res = call <2 x double> @llvm.x86.avx512.mask.scalef.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> undef, i8 -1, i32 4)
+  ret <2 x double> %res
+}
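To check the new patterns locally, the test can be run through llc and
FileCheck in the usual way. The authoritative RUN line is at the top of
avx512-scalarIntrinsics.ll; an invocation along these lines (the -mcpu
choice here is an assumption) should reproduce the CHECK output:

  llc -mtriple=x86_64-unknown-unknown -mcpu=skx < avx512-scalarIntrinsics.ll | FileCheck avx512-scalarIntrinsics.ll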