[llvm] cf81714 - [X86] Model MXCSR for AVX instructions other than AVX512

Mon Dec 2 16:54:07 PST 2019

Author: Wang, Pengfei
Date: 2019-12-03T08:53:47+08:00
New Revision: cf81714a7eb367260f9e6ae5f3bb11bb63d39124

URL: https://github.com/llvm/llvm-project/commit/cf81714a7eb367260f9e6ae5f3bb11bb63d39124
DIFF: https://github.com/llvm/llvm-project/commit/cf81714a7eb367260f9e6ae5f3bb11bb63d39124.diff

LOG: [X86] Model MXCSR for AVX instructions other than AVX512

Summary: Model MXCSR for AVX instructions other than AVX512

Reviewers: craig.topper, RKSimon

Subscribers: hiraditya, llvm-commits, LuoYuanke, LiuChen3

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70875

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86InstrFMA.td
    llvm/lib/Target/X86/X86InstrSSE.td
    llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
    llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 0cca71bdc431..4c84f4f2460d 100644

--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -95,7 +95,8 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
                    Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
 }
 
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
+    Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpcodeStr, string PackTy, string Suff,
                        PatFrag MemFrag128, PatFrag MemFrag256,
@@ -237,7 +238,7 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
 }
 
 let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
-    hasSideEffects = 0 in
+    hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpStr, string PackTy, string Suff,
                        SDNode OpNode, RegisterClass RC,
@@ -263,7 +264,8 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
 // the lowest element of the FMA*_Int instruction. Even though such analysis
 // may be not implemented yet we allow the routines doing the actual commute
 // transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
                         Operand memopr, RegisterClass RC,
                         X86FoldableSchedWrite sched> {
@@ -384,6 +386,7 @@ defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR6
 // FMA4 - AMD 4 operand Fused Multiply-Add instructions
 //===----------------------------------------------------------------------===//
 
+let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
                  X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
                  PatFrag mem_frag, X86FoldableSchedWrite sched> {
@@ -425,7 +428,8 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                      ValueType VT, X86FoldableSchedWrite sched> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in {
   def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
@@ -458,6 +462,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
 } // isCodeGenOnly = 1
 }
 
+let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                  ValueType OpVT128, ValueType OpVT256,
                  PatFrag ld_frag128, PatFrag ld_frag256,

diff  --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ed376d4ce96f..b8e80bcd566a 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5542,7 +5542,7 @@ let ExeDomain = SSEPackedDouble in {
 
 // FP round - roundss, roundps, roundsd, roundpd
 let Predicates = [HasAVX, NoVLX] in {
-  let ExeDomain = SSEPackedSingle in {
+  let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
     // Intrinsic form
     defm VROUNDPS  : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
                                      loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
@@ -5552,7 +5552,7 @@ let Predicates = [HasAVX, NoVLX] in {
                                    VEX, VEX_L, VEX_WIG;
   }
 
-  let ExeDomain = SSEPackedDouble in {
+  let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
     defm VROUNDPD  : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
                                      loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
                                    VEX, VEX_WIG;
@@ -5564,9 +5564,9 @@ let Predicates = [HasAVX, NoVLX] in {
 let Predicates = [UseAVX] in {
   defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
                                   v4f32, v2f64, X86RndScales, 0>,
-                                  VEX_4V, VEX_LIG, VEX_WIG;
+                                  VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
   defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
-                                VEX_4V, VEX_LIG, VEX_WIG;
+                                VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
 }
 
 let Predicates = [UseAVX] in {
@@ -7326,12 +7326,12 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
 }
 
 let Predicates = [HasF16C, NoVLX] in {
-  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
-  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
+  defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
+  defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
-                               WriteCvtPS2PHSt>;
+                               WriteCvtPS2PHSt>, SIMD_EXC;
   defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
-                               WriteCvtPS2PHYSt>, VEX_L;
+                               WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
 
   // Pattern match vcvtph2ps of a scalar i64 load.
   def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),

diff  --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
index da7653255a8d..d88becc251d8 100644
--- a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=+mmx -stop-after finalize-isel -o - %s | FileCheck %s
+; RUN: llc -march=x86-64 -mattr=+mmx,+fma,+f16c -stop-after finalize-isel -o - %s | FileCheck %s
 ; This test ensures that the MXCSR is implicitly used by MMX FP instructions.
 
 define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
@@ -15,8 +15,31 @@ define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
   ret x86_mmx %5
 }
 
+define half @mxcsr_f16c(float %a) {
+; CHECK: VCVTPS2PH{{.*}}mxcsr
+; CHECK: VCVTPH2PS{{.*}}mxcsr
+  %res = fptrunc float %a to half
+  ret half %res
+}
+
+define <4 x float> @mxcsr_fma_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+  ret <4 x float> %res
+}
+
+define <4 x float> @mxcsr_fma_ps(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+  %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+  ret <4 x float> %res
+}
+
 declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
 declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
 declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
 declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
 declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)

diff  --git a/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s b/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
index c629351690cc..9044fbb4353d 100644
--- a/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
+++ b/llvm/test/tools/llvm-exegesis/X86/uops-VFMADDSS4rm.s
@@ -6,5 +6,4 @@ CHECK-NEXT: key:
 CHECK-NEXT:   instructions:
 CHECK-NEXT:     VFMADDSS4rm
 CHECK:      register_initial_values:
-# FIXME: This will be changed to CHECK by the following patch that modeling MXCSR to VFMADDSS.
-CHECK-NOT:      MXCSR
+CHECK:      MXCSR