[llvm] 1ef0432 - [LoopInterchange] Support loop interchange with floating point reductions

Sun Feb 6 14:09:34 PST 2022

Author: Congzhe Cao
Date: 2022-02-06T17:04:47-05:00
New Revision: 1ef04326ec5f634b29868b3900e42683a32d10b1

URL: https://github.com/llvm/llvm-project/commit/1ef04326ec5f634b29868b3900e42683a32d10b1
DIFF: https://github.com/llvm/llvm-project/commit/1ef04326ec5f634b29868b3900e42683a32d10b1.diff

LOG: [LoopInterchange] Support loop interchange with floating point reductions

Enable loop interchange for floating point reductions when the
floating point operations involved are allowed to be reordered
(i.e. they all permit reassociation).

Previously when we encountered a floating point PHI node in the
outer loop exit block, we bailed out since we could not detect
floating point reductions in the early days. Now we remove this
limitation since we are able to detect floating point reductions.

Reviewed By: #loopoptwg, Meinersbur

Differential Revision: https://reviews.llvm.org/D117450

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopInterchange.cpp
    llvm/test/Transforms/LoopInterchange/lcssa.ll
    llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index c2b065c4eb314..79ae4e3d352e2 100644

--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -733,8 +733,12 @@ static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
       if (PHI->getNumIncomingValues() == 1)
         continue;
       RecurrenceDescriptor RD;
-      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+      if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD)) {
+        // Detect floating point reduction only when it can be reordered.
+        if (RD.getExactFPMathInst() != nullptr)
+          return nullptr;
         return PHI;
+      }
       return nullptr;
     }
   }
@@ -893,28 +897,23 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
 static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
   BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
   for (PHINode &PHI : LoopNestExit->phis()) {
-    //  FIXME: We currently are not able to detect floating point reductions
-    //         and have to use floating point PHIs as a proxy to prevent
-    //         interchanging in the presence of floating point reductions.
-    if (PHI.getType()->isFloatingPointTy())
-      return false;
     for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
-     Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
-     if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
-       continue;
-
-     // The incoming value is defined in the outer loop latch. Currently we
-     // only support that in case the outer loop latch has a single predecessor.
-     // This guarantees that the outer loop latch is executed if and only if
-     // the inner loop is executed (because tightlyNested() guarantees that the
-     // outer loop header only branches to the inner loop or the outer loop
-     // latch).
-     // FIXME: We could weaken this logic and allow multiple predecessors,
-     //        if the values are produced outside the loop latch. We would need
-     //        additional logic to update the PHI nodes in the exit block as
-     //        well.
-     if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
-       return false;
+      Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+      if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+        continue;
+
+      // The incoming value is defined in the outer loop latch. Currently we
+      // only support that in case the outer loop latch has a single predecessor.
+      // This guarantees that the outer loop latch is executed if and only if
+      // the inner loop is executed (because tightlyNested() guarantees that the
+      // outer loop header only branches to the inner loop or the outer loop
+      // latch).
+      // FIXME: We could weaken this logic and allow multiple predecessors,
+      //        if the values are produced outside the loop latch. We would need
+      //        additional logic to update the PHI nodes in the exit block as
+      //        well.
+      if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+        return false;
     }
   }
   return true;

diff  --git a/llvm/test/Transforms/LoopInterchange/lcssa.ll b/llvm/test/Transforms/LoopInterchange/lcssa.ll
index 8dd449ba588be..9809689bb0494 100644
--- a/llvm/test/Transforms/LoopInterchange/lcssa.ll
+++ b/llvm/test/Transforms/LoopInterchange/lcssa.ll
@@ -135,9 +135,8 @@ for.end16:                                        ; preds = %for.exit
   ret void
 }
 
-; FIXME: We currently do not support LCSSA phi nodes involving floating point
-;        types, as we fail to detect floating point reductions for now.
-; REMARK: UnsupportedPHIOuter
+; Loops with floating point reductions are interchanged with fastmath.
+; REMARK: Interchanged
 ; REMARK-NEXT: lcssa_04
 
 define void @lcssa_04() {
@@ -146,28 +145,31 @@ entry:
 
 outer.header:                                     ; preds = %outer.inc, %entry
   %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
-  %float.outer = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.outer.next, %outer.inc ]
   br label %for.body3
 
 for.body3:                                        ; preds = %for.body3, %outer.header
   %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %float.inner = phi float [ %float.inner.next, %for.body3 ], [ %float.outer, %outer.header ]
   %arrayidx5 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @A, i64 0, i64 %iv.inner, i64 %iv.outer
   %vA = load i32, i32* %arrayidx5
   %arrayidx9 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* @C, i64 0, i64 %iv.inner, i64 %iv.outer
   %vC = load i32, i32* %arrayidx9
   %add = add nsw i32 %vA, %vC
+  %float.inner.next = fadd fast float %float.inner, 1.000000e+00
   store i32 %add, i32* %arrayidx5
   %iv.inner.next = add nuw nsw i64 %iv.inner, 1
   %exitcond = icmp eq i64 %iv.inner.next, 100
   br i1 %exitcond, label %outer.inc, label %for.body3
 
 outer.inc:                                        ; preds = %for.body3
+  %float.outer.next = phi float [ %float.inner.next, %for.body3 ]
   %iv.outer.next = add nsw i64 %iv.outer, 1
   %cmp = icmp eq i64 %iv.outer.next, 100
   br i1 %cmp, label %outer.header, label %for.exit
 
 for.exit:                                         ; preds = %outer.inc
-  %float.outer.lcssa = phi float [ %float.outer, %outer.inc ]
+  %float.outer.lcssa = phi float [ %float.outer.next, %outer.inc ]
   store float %float.outer.lcssa, float* @F
   br label %for.end16
 

diff  --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
index 52e604a00df32..89b734d5f82fa 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -227,3 +227,83 @@ for1.loopexit:                                 ; preds = %for1.inc
   %il.res.lcssa2 = phi i64 [ %sum.inc.amend, %for1.inc ]
   ret i64 %il.res.lcssa2
 }
+
+; Floating point reductions are interchanged if all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Passed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            Interchanged
+; REMARKS-NEXT: Function:        test5
+
+define float @test5([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd fast float %float.inner, %vA
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}
+
+; Floating point reductions are not interchanged if not all the fp instructions
+; involved allow reassociation.
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass:            loop-interchange
+; REMARKS-NEXT: Name:            UnsupportedPHIOuter
+; REMARKS-NEXT: Function:        test6
+
+define float @test6([100 x [100 x float]]* %Arr, [100 x [100 x float]]* %Arr2) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.inc, %entry
+  %iv.outer = phi i64 [ 1, %entry ], [ %iv.outer.next, %outer.inc ]
+  %float.outer = phi float [ 1.000000e+00, %entry ], [ %float.inner.lcssa, %outer.inc ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %outer.header
+  %float.inner = phi float [ %float.outer , %outer.header ], [ %float.inner.inc.inc, %for.body3 ]
+  %iv.inner = phi i64 [ %iv.inner.next, %for.body3 ], [ 1, %outer.header ]
+  %arrayidx5 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vA = load float, float* %arrayidx5
+  %float.inner.inc = fadd float %float.inner, %vA ; do not allow reassociation
+  %arrayidx6 = getelementptr inbounds [100 x [100 x float]], [100 x [100 x float]]* %Arr2, i64 0, i64 %iv.inner, i64 %iv.outer
+  %vB = load float, float* %arrayidx6
+  %float.inner.inc.inc = fadd fast float %float.inner.inc, %vB
+  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
+  %exitcond = icmp eq i64 %iv.inner.next, 100
+  br i1 %exitcond, label %outer.inc, label %for.body3
+
+outer.inc:                                        ; preds = %for.body3
+  %float.inner.lcssa = phi float [ %float.inner.inc.inc, %for.body3 ]
+  %iv.outer.next = add nsw i64 %iv.outer, 1
+  %cmp = icmp eq i64 %iv.outer.next, 100
+  br i1 %cmp, label %outer.header, label %for.exit
+
+for.exit:                                         ; preds = %outer.inc
+  %float.outer.lcssa = phi float [ %float.inner.lcssa, %outer.inc ]
+  ret float %float.outer.lcssa
+}