[llvm] e44f4a8 - [LoopFusion] Move instructions from FC1.GuardBlock to FC0.GuardBlock and

Thu Jan 30 10:02:38 PST 2020

Author: Whitney Tsang
Date: 2020-01-30T18:02:22Z
New Revision: e44f4a8a54141d5f527ed8ee05362cc98031d723

URL: https://github.com/llvm/llvm-project/commit/e44f4a8a54141d5f527ed8ee05362cc98031d723
DIFF: https://github.com/llvm/llvm-project/commit/e44f4a8a54141d5f527ed8ee05362cc98031d723.diff

LOG: [LoopFusion] Move instructions from FC1.GuardBlock to FC0.GuardBlock and
from FC0.ExitBlock to FC1.ExitBlock when proven safe.

Summary:
Currently LoopFusion give up when the second loop nest guard
block or the first loop nest exit block is not empty. For example:

if (0 < N) {
  for (int i = 0; i < N; ++i) {}
  x+=1;
}
y+=1;
if (0 < N) {
  for (int i = 0; i < N; ++i) {}
}
The above example should be safe to fuse.
This PR moves instructions in FC1 guard block (e.g. y+=1;) to
FC0 guard block, or instructions in FC0 exit block (e.g. x+=1;) to
FC1 exit block, which then LoopFusion is able to fuse them.
Reviewer: kbarton, jdoerfert, Meinersbur, dmgreen, fhahn, hfinkel,
bmahjour, etiotto
Reviewed By: jdoerfert
Subscribers: hiraditya, llvm-commits
Tag: LLVM
Differential Revision: https://reviews.llvm.org/D73641

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopFuse.cpp
    llvm/test/Transforms/LoopFusion/diagnostics_missed.ll
    llvm/test/Transforms/LoopFusion/guarded.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 24597e228870..8d591d783f2e 100644

--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -91,8 +91,10 @@ STATISTIC(
     "Loop has a non-empty preheader with instructions that cannot be moved");
 STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
 STATISTIC(NonIdenticalGuards, "Candidates have 
diff erent guards");
-STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
-STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
+                             "instructions that cannot be moved");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
+                              "instructions that cannot be moved");
 STATISTIC(NotRotated, "Candidate is not rotated");
 
 enum FusionDependenceAnalysisChoice {
@@ -750,25 +752,30 @@ struct LoopFuser {
             continue;
           }
 
-          // The following two checks look for empty blocks in FC0 and FC1. If
-          // any of these blocks are non-empty, we do not fuse. This is done
-          // because we currently do not have the safety checks to determine if
-          // it is safe to move the blocks past other blocks in the loop. Once
-          // these checks are added, these conditions can be relaxed.
-          if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
-            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
-                                 "block. Not fusing.\n");
-            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
-                                                       NonEmptyExitBlock);
-            continue;
-          }
+          if (FC0->GuardBranch) {
+            assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
+
+            if (!isSafeToMoveBefore(*FC0->ExitBlock,
+                                    *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
+                                    PDT, DI)) {
+              LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+                                   "instructions in exit block. Not fusing.\n");
+              reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                         NonEmptyExitBlock);
+              continue;
+            }
 
-          if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
-            LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
-                                 "block. Not fusing.\n");
-            reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
-                                                       NonEmptyGuardBlock);
-            continue;
+            if (!isSafeToMoveBefore(
+                    *FC1->GuardBranch->getParent(),
+                    *FC0->GuardBranch->getParent()->getTerminator(), DT, PDT,
+                    DI)) {
+              LLVM_DEBUG(dbgs()
+                         << "Fusion candidate contains unsafe "
+                            "instructions in guard block. Not fusing.\n");
+              reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+                                                         NonEmptyGuardBlock);
+              continue;
+            }
           }
 
           // Check the dependencies across the loops and do not fuse if it would
@@ -1079,38 +1086,6 @@ struct LoopFuser {
       return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
   }
 
-  /// Check that the guard for \p FC *only* contains the cmp/branch for the
-  /// guard.
-  /// Once we are able to handle intervening code, any code in the guard block
-  /// for FC1 will need to be treated as intervening code and checked whether
-  /// it can safely move around the loops.
-  bool isEmptyGuardBlock(const FusionCandidate &FC) const {
-    assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
-    if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
-      auto *GuardBlock = FC.GuardBranch->getParent();
-      // If the generation of the cmp value is in GuardBlock, then the size of
-      // the guard block should be 2 (cmp + branch). If the generation of the
-      // cmp value is in a 
diff erent block, then the size of the guard block
-      // should only be 1.
-      if (CmpInst->getParent() == GuardBlock)
-        return GuardBlock->size() == 2;
-      else
-        return GuardBlock->size() == 1;
-    }
-
-    return false;
-  }
-
-  bool isEmptyPreheader(const FusionCandidate &FC) const {
-    assert(FC.Preheader && "Expecting a valid preheader");
-    return FC.Preheader->size() == 1;
-  }
-
-  bool isEmptyExitBlock(const FusionCandidate &FC) const {
-    assert(FC.ExitBlock && "Expecting a valid exit block");
-    return FC.ExitBlock->size() == 1;
-  }
-
   /// Simplify the condition of the latch branch of \p FC to true, when both of
   /// its successors are the same.
   void simplifyLatchBranch(const FusionCandidate &FC) const {
@@ -1390,6 +1365,14 @@ struct LoopFuser {
     BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
     BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
 
+    // Move instructions from the exit block of FC0 to the beginning of the exit
+    // block of FC1.
+    moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI);
+
+    // Move instructions from the guard block of FC1 to the end of the guard
+    // block of FC0.
+    moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
+
     assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
 
     SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;

diff  --git a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll
index fe88f3d458b5..7bfc0e9c7b15 100644
--- a/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll
+++ b/llvm/test/Transforms/LoopFusion/diagnostics_missed.ll
@@ -213,6 +213,93 @@ for.second:
 for.end:
   ret void
 }
+
+; CHECK: remark: diagnostics_missed.c:67:3: [unsafe_exitblock]: for.first.preheader and for.second.preheader: Candidate has a non-empty exit block with instructions that cannot be moved
+define void @unsafe_exitblock(i32* noalias %A, i32* noalias %B, i64 %N) {
+for.first.guard:
+  %cmp3 = icmp slt i64 0, %N
+  br i1 %cmp3, label %for.first.preheader, label %for.second.guard
+
+for.first.preheader:
+  br label %for.first, !dbg !83
+
+for.first:
+  %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
+  store i32 0, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.04, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:
+  call void @bar()
+  br label %for.second.guard
+
+for.second.guard:
+  %cmp21 = icmp slt i64 0, %N
+  br i1 %cmp21, label %for.second.preheader, label %for.end
+
+for.second.preheader:
+  br label %for.second
+
+for.second:
+  %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ]
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
+  store i32 0, i32* %arrayidx4, align 4
+  %inc6 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc6, %N
+  br i1 %cmp2, label %for.second, label %for.second.exit
+
+for.second.exit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK: remark: diagnostics_missed.c:72:3: [unsafe_guardblock]: for.first.preheader and for.second.preheader: Candidate has a non-empty guard block with instructions that cannot be moved
+define void @unsafe_guardblock(i32* noalias %A, i32* noalias %B, i64 %N) {
+for.first.guard:
+  %cmp3 = icmp slt i64 0, %N
+  br i1 %cmp3, label %for.first.preheader, label %for.second.guard
+
+for.first.preheader:
+  br label %for.first, !dbg !86
+
+for.first:
+  %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
+  store i32 0, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.04, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:
+  br label %for.second.guard
+
+for.second.guard:
+  call void @bar()
+  %cmp21 = icmp slt i64 0, %N
+  br i1 %cmp21, label %for.second.preheader, label %for.end
+
+for.second.preheader:
+  br label %for.second
+
+for.second:
+  %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ]
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
+  store i32 0, i32* %arrayidx4, align 4
+  %inc6 = add nsw i64 %j.02, 1
+  %cmp2 = icmp slt i64 %inc6, %N
+  br i1 %cmp2, label %for.second, label %for.second.exit
+
+for.second.exit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
 declare void @bar()
 
 attributes #0 = { nounwind readnone speculatable willreturn }
@@ -301,3 +388,9 @@ attributes #0 = { nounwind readnone speculatable willreturn }
 !78 = !{}
 !79 = distinct !DILexicalBlock(scope: !77, file: !3, line: 3, column: 5)
 !80 = !DILocation(line: 62, column: 3, scope: !79)
+!81 = distinct !DISubprogram(name: "unsafe_exitblock", scope: !3, file: !3, line: 65, type: !15, scopeLine: 60, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !78)
+!82 = distinct !DILexicalBlock(scope: !81, file: !3, line: 3, column: 5)
+!83 = !DILocation(line: 67, column: 3, scope: !82)
+!84 = distinct !DISubprogram(name: "unsafe_guardblock", scope: !3, file: !3, line: 70, type: !15, scopeLine: 60, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !78)
+!85 = distinct !DILexicalBlock(scope: !84, file: !3, line: 3, column: 5)
+!86 = !DILocation(line: 72, column: 3, scope: !85)

diff  --git a/llvm/test/Transforms/LoopFusion/guarded.ll b/llvm/test/Transforms/LoopFusion/guarded.ll
index 9d64a39daab9..e3fc58c1f2a6 100644
--- a/llvm/test/Transforms/LoopFusion/guarded.ll
+++ b/llvm/test/Transforms/LoopFusion/guarded.ll
@@ -119,3 +119,116 @@ for.second.exit:
 for.end:
   ret void
 }
+
+; Test that `%add` is moved in for.second.exit, and the two loops for.first
+; and for.second are fused.
+
+; CHECK: void @moveinsts_exitblock
+; CHECK-LABEL: for.first.guard:
+; CHECK: br i1 %cmp.guard, label %for.first.preheader, label %for.end
+; CHECK-LABEL: for.first.preheader:
+; CHECK-NEXT:  br label %for.first
+; CHECK-LABEL: for.first:
+; CHECK:   br i1 %cmp.j, label %for.first, label %for.second.exit
+; CHECK-LABEL: for.second.exit:
+; CHECK-NEXT:  %add = add nsw i32 %x, 1
+; CHECK-NEXT:   br label %for.end
+; CHECK-LABEL: for.end:
+; CHECK-NEXT:   ret void
+define void @moveinsts_exitblock(i32* noalias %A, i32* noalias %B, i64 %N, i32 %x) {
+for.first.guard:
+  %cmp.guard = icmp slt i64 0, %N
+  br i1 %cmp.guard, label %for.first.preheader, label %for.second.guard
+
+for.first.preheader:
+  br label %for.first
+
+for.first:
+  %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
+  store i32 0, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.04, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:
+  %add = add nsw i32 %x, 1
+  br label %for.second.guard
+
+for.second.guard:
+  br i1 %cmp.guard, label %for.second.preheader, label %for.end
+
+for.second.preheader:
+  br label %for.second
+
+for.second:
+  %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ]
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
+  store i32 0, i32* %arrayidx4, align 4
+  %inc6 = add nsw i64 %j.02, 1
+  %cmp.j = icmp slt i64 %inc6, %N
+  br i1 %cmp.j, label %for.second, label %for.second.exit
+
+for.second.exit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; Test that `%add` is moved in for.first.guard, and the two loops for.first
+; and for.second are fused.
+
+; CHECK: void @moveinsts_guardblock
+; CHECK-LABEL: for.first.guard:
+; CHECK-NEXT: %cmp.guard = icmp slt i64 0, %N
+; CHECK-NEXT:  %add = add nsw i32 %x, 1
+; CHECK: br i1 %cmp.guard, label %for.first.preheader, label %for.end
+; CHECK-LABEL: for.first.preheader:
+; CHECK-NEXT:  br label %for.first
+; CHECK-LABEL: for.first:
+; CHECK:   br i1 %cmp.j, label %for.first, label %for.second.exit
+; CHECK-LABEL: for.second.exit:
+; CHECK-NEXT:   br label %for.end
+; CHECK-LABEL: for.end:
+; CHECK-NEXT:   ret void
+define void @moveinsts_guardblock(i32* noalias %A, i32* noalias %B, i64 %N, i32 %x) {
+for.first.guard:
+  %cmp.guard = icmp slt i64 0, %N
+  br i1 %cmp.guard, label %for.first.preheader, label %for.second.guard
+
+for.first.preheader:
+  br label %for.first
+
+for.first:
+  %i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
+  store i32 0, i32* %arrayidx, align 4
+  %inc = add nsw i64 %i.04, 1
+  %cmp = icmp slt i64 %inc, %N
+  br i1 %cmp, label %for.first, label %for.first.exit
+
+for.first.exit:
+  br label %for.second.guard
+
+for.second.guard:
+  %add = add nsw i32 %x, 1
+  br i1 %cmp.guard, label %for.second.preheader, label %for.end
+
+for.second.preheader:
+  br label %for.second
+
+for.second:
+  %j.02 = phi i64 [ %inc6, %for.second ], [ 0, %for.second.preheader ]
+  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
+  store i32 0, i32* %arrayidx4, align 4
+  %inc6 = add nsw i64 %j.02, 1
+  %cmp.j = icmp slt i64 %inc6, %N
+  br i1 %cmp.j, label %for.second, label %for.second.exit
+
+for.second.exit:
+  br label %for.end
+
+for.end:
+  ret void
+}