[llvm] r363032 - [X86] Add load folding isel patterns to scalar_math_patterns and AVX512_scalar_math_fp_patterns.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 10 21:30:53 PDT 2019
Author: ctopper
Date: Mon Jun 10 21:30:53 2019
New Revision: 363032
URL: http://llvm.org/viewvc/llvm-project?rev=363032&view=rev
Log:
[X86] Add load folding isel patterns to scalar_math_patterns and AVX512_scalar_math_fp_patterns.
Also add a FIXME noting that the peephole pass is not able to perform this fold on its own.
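
For context, the shape these new patterns match is a scalar FP op on element 0 whose second operand comes straight from memory, with the result re-inserted into the source vector. A minimal IR sketch of that shape (modeled on the existing fold-load-binops.ll tests; the function and value names are illustrative, not the literal test body):

    define <4 x float> @addss_fold(<4 x float> %v, float* %p) {
      ; Scalar operand is loaded directly from memory.
      %b   = load float, float* %p
      %a   = extractelement <4 x float> %v, i32 0
      %op  = fadd float %a, %b
      ; Re-insert into element 0, keeping the upper elements of %v.
      %res = insertelement <4 x float> %v, float %op, i32 0
      ret <4 x float> %res
    }

With the added patterns, isel can select the memory form directly (e.g. the rm_Int / Zrm_Int flavors) instead of emitting a separate movss/movsd load followed by the register form.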
Modified:
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
llvm/trunk/lib/Target/X86/X86InstrSSE.td
llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
llvm/trunk/test/CodeGen/X86/sse-scalar-fp-arith.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=363032&r1=363031&r2=363032&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Jun 10 21:30:53 2019
@@ -11871,6 +11871,12 @@ multiclass AVX512_scalar_math_fp_pattern
_.FRC:$src)))),
(!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
(_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+ def : Pat<(MoveNode
+ (_.VT VR128X:$dst),
+ (_.VT (scalar_to_vector
+ (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src))))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11884,6 +11890,16 @@ multiclass AVX512_scalar_math_fp_pattern
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
+ _.FRC:$src0))),
+ (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+ (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+ VK1WM:$mask, _.VT:$src1, addr:$src2)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11895,6 +11911,13 @@ multiclass AVX512_scalar_math_fp_pattern
(!cast<I>("V"#OpcPrefix#Zrr_Intkz)
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+ def : Pat<(MoveNode (_.VT VR128X:$src1),
+ (scalar_to_vector
+ (X86selects VK1WM:$mask,
+ (Op (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
}
}
Modified: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrInfo.cpp?rev=363032&r1=363031&r2=363032&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp Mon Jun 10 21:30:53 2019
@@ -4685,6 +4685,7 @@ MachineInstr *X86InstrInfo::foldMemoryOp
&RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
if (Size < RCSize) {
+ // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=363032&r1=363031&r2=363032&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Jun 10 21:30:53 2019
@@ -2692,7 +2692,8 @@ let isCodeGenOnly = 1 in {
// patterns we have to try to match.
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
ValueType VT, ValueType EltTy,
- RegisterClass RC, Predicate BasePredicate> {
+ RegisterClass RC, PatFrag ld_frag,
+ Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
@@ -2701,6 +2702,11 @@ multiclass scalar_math_patterns<SDNode O
RC:$src))))),
(!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
// Repeat for AVX versions of the instructions.
@@ -2712,18 +2718,23 @@ multiclass scalar_math_patterns<SDNode O
RC:$src))))),
(!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
(VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+ def : Pat<(VT (Move (VT VR128:$dst),
+ (VT (scalar_to_vector
+ (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+ (ld_frag addr:$src)))))),
+ (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
}
}
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
Modified: llvm/trunk/test/CodeGen/X86/fold-load-binops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/fold-load-binops.ll?rev=363032&r1=363031&r2=363032&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/fold-load-binops.ll (original)
+++ llvm/trunk/test/CodeGen/X86/fold-load-binops.ll Mon Jun 10 21:30:53 2019
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
; Verify that we're folding the load into the math instruction.
; This pattern is generated out of the simplest intrinsics usage:
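
The blend_* tests in sse-scalar-fp-arith.ll (modified below) exercise the same fold when the scalar operand is a plain float/double argument: under the 32-bit calling convention that argument is passed on the stack, so the op's second operand is a load, and the X86 check lines collapse from a movss/movsd load plus a register op into a single memory-operand op. A hedged sketch of that shape (names illustrative, not the literal test body):

    define <4 x float> @blend_add_ss_sketch(<4 x float> %a, float %b) {
      ; On i686, %b arrives on the stack, so the fadd operand is a stack load.
      %ext = extractelement <4 x float> %a, i32 0
      %op  = fadd float %b, %ext
      %res = insertelement <4 x float> %a, float %op, i32 0
      ret <4 x float> %res
    }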
Modified: llvm/trunk/test/CodeGen/X86/sse-scalar-fp-arith.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/sse-scalar-fp-arith.ll?rev=363032&r1=363031&r2=363032&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/sse-scalar-fp-arith.ll (original)
+++ llvm/trunk/test/CodeGen/X86/sse-scalar-fp-arith.ll Mon Jun 10 21:30:53 2019
@@ -414,14 +414,12 @@ define <4 x float> @test_multiple_div_ss
define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_add_ss:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: addss %xmm1, %xmm0
+; X86-SSE-NEXT: addss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_add_ss:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_add_ss:
@@ -444,14 +442,12 @@ define <4 x float> @blend_add_ss(<4 x fl
define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_sub_ss:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: subss %xmm1, %xmm0
+; X86-SSE-NEXT: subss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_sub_ss:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_sub_ss:
@@ -474,14 +470,12 @@ define <4 x float> @blend_sub_ss(<4 x fl
define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_mul_ss:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: mulss %xmm1, %xmm0
+; X86-SSE-NEXT: mulss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_mul_ss:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_mul_ss:
@@ -504,14 +498,12 @@ define <4 x float> @blend_mul_ss(<4 x fl
define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_div_ss:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: divss %xmm1, %xmm0
+; X86-SSE-NEXT: divss {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_div_ss:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_div_ss:
@@ -534,14 +526,12 @@ define <4 x float> @blend_div_ss(<4 x fl
define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_add_sd:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: addsd %xmm1, %xmm0
+; X86-SSE-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_add_sd:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_add_sd:
@@ -564,14 +554,12 @@ define <2 x double> @blend_add_sd(<2 x d
define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_sub_sd:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: subsd %xmm1, %xmm0
+; X86-SSE-NEXT: subsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_sub_sd:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_sub_sd:
@@ -594,14 +582,12 @@ define <2 x double> @blend_sub_sd(<2 x d
define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_mul_sd:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: mulsd %xmm1, %xmm0
+; X86-SSE-NEXT: mulsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_mul_sd:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_mul_sd:
@@ -624,14 +610,12 @@ define <2 x double> @blend_mul_sd(<2 x d
define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_div_sd:
; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: divsd %xmm1, %xmm0
+; X86-SSE-NEXT: divsd {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT: retl
;
; X86-AVX-LABEL: blend_div_sd:
; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT: vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT: retl
;
; X64-SSE-LABEL: blend_div_sd: