[llvm] r366222 - [Strict FP] Allow more relaxed scheduling

Ulrich Weigand via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 16 08:55:45 PDT 2019


Author: uweigand
Date: Tue Jul 16 08:55:45 2019
New Revision: 366222

URL: http://llvm.org/viewvc/llvm-project?rev=366222&view=rev
Log:
[Strict FP] Allow more relaxed scheduling

Reimplement scheduling constraints for strict FP instructions in
ScheduleDAGInstrs::buildSchedGraph to allow for more relaxed
scheduling.  Specifically, allow one strict FP instruction to
be scheduled across another, as long as it is not moved across
any global barrier.
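
To make the intended dependence structure concrete, here is a minimal
standalone sketch (a toy model with invented types such as Inst; it is
not LLVM's actual SUnit/ScheduleDAGInstrs code, and it omits details
like chaining consecutive barriers together).  It mirrors the
bottom-up walk in buildSchedGraph: an instruction that may raise FP
exceptions is ordered against the surrounding global barriers only,
and gets no edge against other FP instructions.

#include <cstdio>
#include <vector>

struct Inst {
  const char *Name;
  bool IsBarrier;        // a global scheduling barrier, e.g. SFPC
  bool MayRaiseFPExcept; // e.g. a strict square root
};

int main() {
  // A block modeled after the new SystemZ tests: two strict square
  // roots, an SFPC, then another strict square root.
  std::vector<Inst> Block = {
      {"sqrt1", false, true},
      {"sqrt2", false, true},
      {"sfpc", true, false},
      {"sqrt3", false, true},
  };

  int BarrierChain = -1;         // last barrier seen in the walk
  std::vector<int> FPExceptions; // FP instructions below that barrier

  // Walk the block bottom-up, as buildSchedGraph does.
  // "edge: A -> B" means A must be scheduled before B.
  for (int I = (int)Block.size() - 1; I >= 0; --I) {
    if (Block[I].IsBarrier) {
      // Everything recorded so far lies below the barrier and must
      // stay below it (the role of addBarrierChain(FPExceptions)).
      for (int FP : FPExceptions)
        std::printf("edge: %s -> %s\n", Block[I].Name, Block[FP].Name);
      FPExceptions.clear();
      BarrierChain = I;
      continue;
    }
    if (Block[I].MayRaiseFPExcept) {
      // Stay above the next barrier, but move freely otherwise.
      if (BarrierChain >= 0)
        std::printf("edge: %s -> %s\n", Block[I].Name,
                    Block[BarrierChain].Name);
      FPExceptions.push_back(I);
    }
  }
  return 0;
}

For this block the sketch prints sfpc -> sqrt3, sqrt2 -> sfpc, and
sqrt1 -> sfpc, but no edge between sqrt1 and sqrt2: the two square
roots may now be scheduled across each other, which is exactly the
relaxation described above.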

Differential Revision: https://reviews.llvm.org/D64412

Reviewed By: cameron.mcinally


Modified:
    llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
    llvm/trunk/test/CodeGen/SystemZ/fp-strict-alias.ll
    llvm/trunk/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll

Modified: llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp?rev=366222&r1=366221&r2=366222&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp (original)
+++ llvm/trunk/lib/CodeGen/ScheduleDAGInstrs.cpp Tue Jul 16 08:55:45 2019
@@ -712,7 +712,6 @@ void ScheduleDAGInstrs::buildSchedGraph(
   AAForDep = UseAA ? AA : nullptr;
 
   BarrierChain = nullptr;
-  SUnit *FPBarrierChain = nullptr;
 
   this->TrackLaneMasks = TrackLaneMasks;
   MISUnitMap.clear();
@@ -744,6 +743,14 @@ void ScheduleDAGInstrs::buildSchedGraph(
   // done.
   Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
 
+  // Track all instructions that may raise floating-point exceptions.
+  // These do not depend on one another (or on normal loads or stores), but
+  // must not be rescheduled across global barriers.  Note that we don't
+  // really need a "map" here since we don't track those MIs by value;
+  // using the same Value2SUsMap data type here is simply a matter of
+  // convenience.
+  Value2SUsMap FPExceptions;
+
   // Remove any stale debug info; sometimes BuildSchedGraph is called again
   // without emitting the info from the previous call.
   DbgValues.clear();
@@ -871,20 +878,24 @@ void ScheduleDAGInstrs::buildSchedGraph(
       addBarrierChain(Loads);
       addBarrierChain(NonAliasStores);
       addBarrierChain(NonAliasLoads);
-
-      // Add dependency against previous FP barrier and reset FP barrier.
-      if (FPBarrierChain)
-        FPBarrierChain->addPredBarrier(BarrierChain);
-      FPBarrierChain = BarrierChain;
+      addBarrierChain(FPExceptions);
 
       continue;
     }
 
-    // Instructions that may raise FP exceptions depend on each other.
+    // Instructions that may raise FP exceptions may not be moved
+    // across any global barriers.
     if (MI.mayRaiseFPException()) {
-      if (FPBarrierChain)
-        FPBarrierChain->addPredBarrier(SU);
-      FPBarrierChain = SU;
+      if (BarrierChain)
+        BarrierChain->addPredBarrier(SU);
+
+      FPExceptions.insert(SU, UnknownValue);
+
+      if (FPExceptions.size() >= HugeRegion) {
+        LLVM_DEBUG(dbgs() << "Reducing FPExceptions map.\n";);
+        Value2SUsMap empty;
+        reduceHugeMemNodeMaps(FPExceptions, empty, getReductionSize());
+      }
     }
 
     // If it's not a store or a variant load, we're done.

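For context, the addBarrierChain helper reused above predates this
patch; paraphrased from memory (see ScheduleDAGInstrs.cpp for the
exact code), it attaches every SUnit recorded in the given map to the
current BarrierChain and then empties the map, so each FP instruction
ends up ordered only against its nearest enclosing barriers:

void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
  assert(BarrierChain != nullptr);
  for (auto &I : map) {
    SUList &SUs = I.second;
    for (SUnit *SU : SUs)
      SU->addPredBarrier(BarrierChain);
  }
  map.clear();
}
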
Modified: llvm/trunk/test/CodeGen/SystemZ/fp-strict-alias.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/fp-strict-alias.ll?rev=366222&r1=366221&r2=366222&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/fp-strict-alias.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/fp-strict-alias.ll Tue Jul 16 08:55:45 2019
@@ -2,138 +2,216 @@
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
 
-declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata)
-declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
 declare float @llvm.sqrt.f32(float)
 declare void @llvm.s390.sfpc(i32)
 
-; For non-strict operations, we expect the post-RA scheduler to
-; separate the two square root instructions on z13.
-define void @f1(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+; The basic assumption of all following tests is that on z13, we never
+; want to see two square root instructions directly in a row, so the
+; post-RA scheduler will always schedule something else in between
+; whenever possible.
+
+; We can move any FP operation across a (normal) store.
+
+define void @f1(float %f1, float %f2, float *%ptr1, float *%ptr2) {
 ; CHECK-LABEL: f1:
 ; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = fadd float %f1, %f2
-  %sub = fsub float %f3, %f4
-  %sqrt1 = call float @llvm.sqrt.f32(float %f2)
-  %sqrt2 = call float @llvm.sqrt.f32(float %f4)
-
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; But for strict operations, this must not happen.
-define void @f2(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
+define void @f2(float %f1, float %f2, float *%ptr1, float *%ptr2) {
 ; CHECK-LABEL: f2:
-; CHECK: {{aebr|sebr}}
-; CHECK: {{aebr|sebr}}
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = call float @llvm.experimental.constrained.fadd.f32(
-                        float %f1, float %f2,
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
-  %sub = call float @llvm.experimental.constrained.fsub.f32(
-                        float %f3, float %f4,
+                        metadata !"fpexcept.ignore")
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f2,
                         metadata !"round.dynamic",
-                        metadata !"fpexcept.strict")
+                        metadata !"fpexcept.ignore")
+
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f3(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f3:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
   %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f2,
+                        float %f1,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict")
   %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f4,
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.strict")
 
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  ret void
+}
+
+
+; We can move a non-strict FP operation or a fpexcept.ignore
+; operation even across a volatile store, but not a fpexcept.strict
+; operation.
+
+define void @f4(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f4:
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; On the other hand, strict operations that use the fpexcept.ignore
-; exception behaviour should be scheduled freely.
-define void @f3(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
-; CHECK-LABEL: f3:
+define void @f5(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f5:
 ; CHECK: sqebr
-; CHECK: {{aebr|sebr}}
+; CHECK: ste
 ; CHECK: sqebr
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = call float @llvm.experimental.constrained.fadd.f32(
-                        float %f1, float %f2,
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
-  %sub = call float @llvm.experimental.constrained.fsub.f32(
-                        float %f3, float %f4,
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f6(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f6:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
   %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
+                        metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
                         float %f2,
                         metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+
+  store volatile float %sqrt1, float *%ptr1
+  store volatile float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+
+; No variant of FP operations can be scheduled across an SFPC.
+
+define void @f7(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f7:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.sqrt.f32(float %f1)
+  %sqrt2 = call float @llvm.sqrt.f32(float %f2)
+
+  call void @llvm.s390.sfpc(i32 0)
+
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
+
+  ret void
+}
+
+define void @f8(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f8:
+; CHECK: sqebr
+; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
+; CHECK: br %r14
+
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
+                        metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
   %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
-                        float %f4,
+                        float %f2,
                         metadata !"round.dynamic",
                         metadata !"fpexcept.ignore")
 
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
+  call void @llvm.s390.sfpc(i32 0)
 
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
   ret void
 }
 
-; However, even non-strict operations must not be scheduled across an SFPC.
-define void @f4(float %f1, float %f2, float %f3, float %f4, float *%ptr0) {
-; CHECK-LABEL: f4:
-; CHECK: {{aebr|sebr}}
-; CHECK: {{aebr|sebr}}
-; CHECK: sfpc
+define void @f9(float %f1, float %f2, float *%ptr1, float *%ptr2) {
+; CHECK-LABEL: f9:
 ; CHECK: sqebr
 ; CHECK: sqebr
+; CHECK: ste
+; CHECK: ste
 ; CHECK: br %r14
 
-  %add = fadd float %f1, %f2
-  %sub = fsub float %f3, %f4
+  %sqrt1 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f1,
+                        metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+  %sqrt2 = call float @llvm.experimental.constrained.sqrt.f32(
+                        float %f2,
+                        metadata !"round.dynamic",
+                        metadata !"fpexcept.strict")
+
   call void @llvm.s390.sfpc(i32 0)
-  %sqrt1 = call float @llvm.sqrt.f32(float %f2)
-  %sqrt2 = call float @llvm.sqrt.f32(float %f4)
 
-  %ptr1 = getelementptr float, float *%ptr0, i64 1
-  %ptr2 = getelementptr float, float *%ptr0, i64 2
-  %ptr3 = getelementptr float, float *%ptr0, i64 3
-
-  store float %add, float *%ptr0
-  store float %sub, float *%ptr1
-  store float %sqrt1, float *%ptr2
-  store float %sqrt2, float *%ptr3
+  store float %sqrt1, float *%ptr1
+  store float %sqrt2, float *%ptr2
 
   ret void
 }

Modified: llvm/trunk/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll?rev=366222&r1=366221&r2=366222&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll Tue Jul 16 08:55:45 2019
@@ -108,8 +108,8 @@ define void @constrained_vector_fdiv_v3f
 ; S390X-NEXT:    ldeb %f3, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI3_2
 ; S390X-NEXT:    ldeb %f4, 0(%r1)
-; S390X-NEXT:    ddb %f2, 0(%r2)
 ; S390X-NEXT:    ddbr %f3, %f1
+; S390X-NEXT:    ddb %f2, 0(%r2)
 ; S390X-NEXT:    ddbr %f4, %f0
 ; S390X-NEXT:    std %f4, 16(%r2)
 ; S390X-NEXT:    std %f3, 8(%r2)
@@ -659,16 +659,16 @@ entry:
 define void @constrained_vector_fmul_v3f64(<3 x double>* %a) {
 ; S390X-LABEL: constrained_vector_fmul_v3f64:
 ; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    ld %f0, 8(%r2)
 ; S390X-NEXT:    larl %r1, .LCPI13_0
-; S390X-NEXT:    ld %f0, 0(%r1)
-; S390X-NEXT:    ld %f1, 8(%r2)
+; S390X-NEXT:    ld %f1, 0(%r1)
 ; S390X-NEXT:    ld %f2, 16(%r2)
-; S390X-NEXT:    ldr %f3, %f0
+; S390X-NEXT:    mdbr %f0, %f1
+; S390X-NEXT:    ldr %f3, %f1
 ; S390X-NEXT:    mdb %f3, 0(%r2)
-; S390X-NEXT:    mdbr %f1, %f0
-; S390X-NEXT:    mdbr %f2, %f0
+; S390X-NEXT:    mdbr %f2, %f1
 ; S390X-NEXT:    std %f2, 16(%r2)
-; S390X-NEXT:    std %f1, 8(%r2)
+; S390X-NEXT:    std %f0, 8(%r2)
 ; S390X-NEXT:    std %f3, 0(%r2)
 ; S390X-NEXT:    br %r14
 ;
@@ -832,16 +832,16 @@ entry:
 define void @constrained_vector_fadd_v3f64(<3 x double>* %a) {
 ; S390X-LABEL: constrained_vector_fadd_v3f64:
 ; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    ld %f0, 8(%r2)
 ; S390X-NEXT:    larl %r1, .LCPI18_0
-; S390X-NEXT:    ld %f0, 0(%r1)
-; S390X-NEXT:    ld %f1, 8(%r2)
+; S390X-NEXT:    ld %f1, 0(%r1)
 ; S390X-NEXT:    ld %f2, 16(%r2)
-; S390X-NEXT:    ldr %f3, %f0
+; S390X-NEXT:    adbr %f0, %f1
+; S390X-NEXT:    ldr %f3, %f1
 ; S390X-NEXT:    adb %f3, 0(%r2)
-; S390X-NEXT:    adbr %f1, %f0
-; S390X-NEXT:    adbr %f2, %f0
+; S390X-NEXT:    adbr %f2, %f1
 ; S390X-NEXT:    std %f2, 16(%r2)
-; S390X-NEXT:    std %f1, 8(%r2)
+; S390X-NEXT:    std %f0, 8(%r2)
 ; S390X-NEXT:    std %f3, 0(%r2)
 ; S390X-NEXT:    br %r14
 ;
@@ -969,14 +969,14 @@ define <3 x float> @constrained_vector_f
 ; S390X:       # %bb.0: # %entry
 ; S390X-NEXT:    larl %r1, .LCPI22_0
 ; S390X-NEXT:    le %f0, 0(%r1)
-; S390X-NEXT:    lzer %f1
 ; S390X-NEXT:    ler %f4, %f0
-; S390X-NEXT:    sebr %f4, %f1
 ; S390X-NEXT:    larl %r1, .LCPI22_1
 ; S390X-NEXT:    ler %f2, %f0
 ; S390X-NEXT:    seb %f2, 0(%r1)
 ; S390X-NEXT:    larl %r1, .LCPI22_2
 ; S390X-NEXT:    seb %f0, 0(%r1)
+; S390X-NEXT:    lzer %f1
+; S390X-NEXT:    sebr %f4, %f1
 ; S390X-NEXT:    br %r14
 ;
 ; SZ13-LABEL: constrained_vector_fsub_v3f32:



