[llvm] f01d9e6 - [SimplifyCFG] Fix inconsistency in block size assessment for threading

Mon Jun 29 22:40:32 PDT 2020

Author: Max Kazantsev
Date: 2020-06-30T12:40:07+07:00
New Revision: f01d9e6fc3e291a2faed8c9b7dcbabf760f32bd6

URL: https://github.com/llvm/llvm-project/commit/f01d9e6fc3e291a2faed8c9b7dcbabf760f32bd6
DIFF: https://github.com/llvm/llvm-project/commit/f01d9e6fc3e291a2faed8c9b7dcbabf760f32bd6.diff

LOG: [SimplifyCFG] Fix inconsistency in block size assessment for threading

Sometimes SimplifyCFG may decide to perform jump threading. To do so,
it uses the following algorithm:

1. Checks if the block is small enough for threading;
2. If yes, inserts a PR Phi, relying on the next iteration to remove it
   by performing jump threading;
3. The next iteration checks the block again and performs the threading.

This logic has a corner case: inserting the PR Phi increases the block's size
by 1. If the block size at the first check was the maximum allowed, one more
Phi will exceed this size, and we will neither perform threading nor remove the
created Phi node. As a result, we will end up with worse IR than before.

This patch fixes this situation by excluding Phis from block size computation.
Excluding Phis from size computation for threading also makes sense by
itself because in case of threading all those Phis will be removed.

Differential Revision: https://reviews.llvm.org/D81835
Reviewed By: asbirlea, nikic

Added: 
    

Modified: 
    llvm/lib/Transforms/Utils/SimplifyCFG.cpp
    llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d899ba7a2690..e6ca52acb4a1 100644

--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -133,6 +133,11 @@ static cl::opt<unsigned> MaxSpeculationDepth(
     cl::desc("Limit maximum recursion depth when calculating costs of "
              "speculatively executed instructions"));
 
+static cl::opt<int>
+MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
+                  cl::desc("Max size of a block which is still considered "
+                           "small enough to thread through"));
+
 STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLinearMaps,
           "Number of switch instructions turned into linear mapping");
@@ -2189,12 +2194,15 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
 
 /// Return true if we can thread a branch across this block.
 static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
-  unsigned Size = 0;
+  int Size = 0;
 
   for (Instruction &I : BB->instructionsWithoutDebug()) {
-    if (Size > 10)
+    if (Size > MaxSmallBlockSize)
       return false; // Don't clone large BB's.
-    ++Size;
+    // We will delete Phis while threading, so Phis should not be accounted in
+    // block's size
+    if (!isa<PHINode>(I))
+      ++Size;
 
     // We can only support instructions that do not define values that are
     // live outside of the current basic block.

diff  --git a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
index 461003717797..31ed7e203c45 100644
--- a/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
+++ b/llvm/test/Transforms/SimplifyCFG/unprofitable-pr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -simplifycfg -S < %s | FileCheck %s
-; RUN: opt -passes=simplify-cfg -S < %s | FileCheck %s
+; RUN: opt -simplifycfg -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s
+; RUN: opt -passes=simplify-cfg -simplifycfg-max-small-block-size=10 -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64-p5:32:32-A5"
 
@@ -50,14 +50,71 @@ false2:                                           ; preds = %true1
   ret void
 }
 
-; FIXME: SimplifyCFG is doing something weird here. It should have split the
-; blocks like in the test above, but instead it creates .pr Phi node which
-; only complicates things.
+; Corner case: the block has max possible size for which we still do PRE.
 define void @test_02(i1 %c, i64* align 1 %ptr) local_unnamed_addr #0 {
 ; CHECK-LABEL: @test_02(
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[TRUE2_CRITEDGE:%.*]], label [[FALSE1:%.*]]
+; CHECK:       false1:
+; CHECK-NEXT:    store volatile i64 1, i64* [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i64* [[PTR]] to i64
+; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7
+; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT:    store volatile i64 0, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 3, i64* [[PTR]], align 8
+; CHECK-NEXT:    ret void
+; CHECK:       true2.critedge:
+; CHECK-NEXT:    [[PTRINT_C:%.*]] = ptrtoint i64* [[PTR]] to i64
+; CHECK-NEXT:    [[MASKEDPTR_C:%.*]] = and i64 [[PTRINT_C]], 7
+; CHECK-NEXT:    [[MASKCOND_C:%.*]] = icmp eq i64 [[MASKEDPTR_C]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND_C]])
+; CHECK-NEXT:    store volatile i64 0, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    store volatile i64 2, i64* [[PTR]], align 8
+; CHECK-NEXT:    ret void
+;
+  br i1 %c, label %true1, label %false1
+
+true1:                                            ; preds = %false1, %0
+  %ptrint = ptrtoint i64* %ptr to i64
+  %maskedptr = and i64 %ptrint, 7
+  %maskcond = icmp eq i64 %maskedptr, 0
+  tail call void @llvm.assume(i1 %maskcond)
+  store volatile i64 0, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
+  br i1 %c, label %true2, label %false2
+
+false1:                                           ; preds = %0
+  store volatile i64 1, i64* %ptr, align 4
+  br label %true1
+
+true2:                                            ; preds = %true1
+  store volatile i64 2, i64* %ptr, align 8
+  ret void
+
+false2:                                           ; preds = %true1
+  store volatile i64 3, i64* %ptr, align 8
+  ret void
+}
+
+; This block is too huge for PRE.
+define void @test_03(i1 %c, i64* align 1 %ptr) local_unnamed_addr #0 {
+; CHECK-LABEL: @test_03(
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[TRUE1:%.*]], label [[FALSE1:%.*]]
 ; CHECK:       true1:
-; CHECK-NEXT:    [[C_PR:%.*]] = phi i1 [ [[C]], [[FALSE1]] ], [ true, [[TMP0:%.*]] ]
 ; CHECK-NEXT:    [[PTRINT:%.*]] = ptrtoint i64* [[PTR:%.*]] to i64
 ; CHECK-NEXT:    [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 7
 ; CHECK-NEXT:    [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
@@ -68,7 +125,8 @@ define void @test_02(i1 %c, i64* align 1 %ptr) local_unnamed_addr #0 {
 ; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
 ; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
 ; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
-; CHECK-NEXT:    br i1 [[C_PR]], label [[TRUE2:%.*]], label [[FALSE2:%.*]]
+; CHECK-NEXT:    store volatile i64 -1, i64* [[PTR]], align 8
+; CHECK-NEXT:    br i1 [[C]], label [[TRUE2:%.*]], label [[FALSE2:%.*]]
 ; CHECK:       false1:
 ; CHECK-NEXT:    store volatile i64 1, i64* [[PTR]], align 4
 ; CHECK-NEXT:    br label [[TRUE1]]
@@ -92,6 +150,7 @@ true1:                                            ; preds = %false1, %0
   store volatile i64 -1, i64* %ptr, align 8
   store volatile i64 -1, i64* %ptr, align 8
   store volatile i64 -1, i64* %ptr, align 8
+  store volatile i64 -1, i64* %ptr, align 8
   br i1 %c, label %true2, label %false2
 
 false1:                                           ; preds = %0