[PATCH] D67281: [AArch64][SimplifyCFG] Add additional cost for instructions in mergeConditionalStoreToAddress

Pavel Kosov via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 20 05:24:22 PST 2019


kpdev42 updated this revision to Diff 230235.
kpdev42 added a comment.

I changed my patch, now before merging blocks it calculates existing instructions in DomBB, 
just to be sure that this block is not too big already
Maybe instead of using phi-node-folding threshold it is needed to use some other separate threshold.

And also it is still not clear, if we need to optimize this in a middle end or in a backend.
Maybe backend will be more proper place for it. @dmgreen please feel free to  bring a light to this problem

P.S.: In this change I also simplified a testcase

---------------------------------------------------------------------------------------------

Recall, what exactly I trying to eliminate: 
clang merges too big basic blocks and it leads to performance regression

Before merging:

          tst     x18, x1
          b.eq    .LBB0_10
  .LBB0_9:
          orr     x16, x16, x18
          add     w0, w0, #1
          str     xzr, [x13, #56]
  .LBB0_10:
          cbz     x11, .LBB0_7

After merging

  and     x2, x18, x1
  orr     x3, x16, x18
  tst     x18, x1
  orr     x2, x2, x11
  cinc    w0, w0, ne
  csel    x16, x16, x3, eq
  cbz     x2, .LBB0_7

------------------------------------------------------------------------------------------------

Similar picture in test case:
Before merging:

  for.body.lr.ph:                                   ; preds = %entry
    %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
    %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
    %and2 = and i64 %mask, 1
    %tobool3 = icmp eq i64 %and2, 0
    %and = and i64 %bit, %in
    %tobool = icmp eq i64 %and, 0
    br i1 %tobool, label %if.end, label %if.then
  
  if.then:                                          ; preds = %for.body.lr.ph
    %or = or i64 0, %bit
    %inc = add nsw i32 0, 1
    br label %if.end
  
  if.end:                                           ; preds = %if.then, %for.body.lr.ph
    %retval1.1 = phi i32 [ %inc, %if.then ], [ 0, %for.body.lr.ph ]
    %res_in.1 = phi i64 [ %or, %if.then ], [ 0, %for.body.lr.ph ]
    %1 = xor i1 %tobool, true
    %2 = xor i1 %tobool3, true
    %3 = or i1 %1, %2
    store i8* null, i8** %proc, align 8
    %shl = shl i64 %bit, 1
    %cmp = icmp eq i64 %shl, 0
    br label %for.end

After merging:

  for.body.lr.ph:                                   ; preds = %entry
    %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
    %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
    %and2 = and i64 %mask, 1
    %tobool3 = icmp eq i64 %and2, 0
    %and = and i64 %bit, %in
    %tobool = icmp eq i64 %and, 0
    %or = or i64 0, %bit
    %inc = add nsw i32 0, 1
    %retval1.1 = select i1 %tobool, i32 0, i32 %inc
    %res_in.1 = select i1 %tobool, i64 0, i64 %or
    %1 = xor i1 %tobool, true
    %2 = xor i1 %tobool3, true
    %3 = or i1 %1, %2
    store i8* null, i8** %proc, align 8
    %shl = shl i64 %bit, 1
    %cmp = icmp eq i64 %shl, 0
    br label %for.end

------------------------------------------------------------------------------------------------


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D67281/new/

https://reviews.llvm.org/D67281

Files:
  llvm/lib/Transforms/Utils/SimplifyCFG.cpp
  llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll


Index: llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/AArch64/check-instr-cost-for-folding.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -simplifycfg-db-instr-count=true -mtriple=aarch64-linux-gnu -simplifycfg -S >%t
+; RUN: FileCheck %s < %t
+; ModuleID = 'test_func.c'
+
+%struct.ptr_wrapper = type { i32*, i8* }
+
+ at g_wrapper = external dso_local local_unnamed_addr global %struct.ptr_wrapper*, align 8
+
+define dso_local i32 @test_func(i64 %in, i64 %bit, i64 %mask) local_unnamed_addr {
+entry:
+  %cmp16 = icmp eq i64 %bit, 0
+  br i1 %cmp16, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+; CHECK-LABEL: for.body.lr.ph:
+; CHECK-NOT: select
+  %0 = load %struct.ptr_wrapper*, %struct.ptr_wrapper** @g_wrapper, align 8
+  %proc = getelementptr inbounds %struct.ptr_wrapper, %struct.ptr_wrapper* %0, i64 0, i32 1
+  %and2 = and i64 %mask, 1
+  %tobool3 = icmp eq i64 %and2, 0
+  %and = and i64 %bit, %in
+  %tobool = icmp eq i64 %and, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:                                          ; preds = %for.body.lr.ph
+  %or = or i64 0, %bit
+  %inc = add nsw i32 0, 1
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body.lr.ph
+  %retval1.1 = phi i32 [ %inc, %if.then ], [ 0, %for.body.lr.ph ]
+  %res_in.1 = phi i64 [ %or, %if.then ], [ 0, %for.body.lr.ph ]
+  %1 = xor i1 %tobool, true
+  %2 = xor i1 %tobool3, true
+  %3 = or i1 %1, %2
+  store i8* null, i8** %proc, align 8
+  %shl = shl i64 %bit, 1
+  %cmp = icmp eq i64 %shl, 0
+  br label %for.end
+
+for.end:                                          ; preds = %if.end, %entry
+  %retval1.0.lcssa = phi i32 [ 0, %entry ], [ %retval1.1, %if.end ]
+  %res_in.0.lcssa = phi i64 [ 0, %entry ], [ %res_in.1, %if.end ]
+  %4 = trunc i64 %res_in.0.lcssa to i32
+  %conv7 = add i32 %retval1.0.lcssa, %4
+  ret i32 %conv7
+}
Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -101,6 +101,10 @@
              "to speculatively execute to fold a 2-entry PHI node into a "
              "select (default = 4)"));
 
+static cl::opt<bool> CheckDomBlockInstructionsCount(
+    "simplifycfg-db-instr-count", cl::Hidden, cl::init(false),
+    cl::desc("Do not merge BBs if domblock already has more than phi-node-folding-threshold instructions"));
+
 static cl::opt<bool> DupRet(
     "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
     cl::desc("Duplicate return instructions into unconditional branches"));
@@ -2413,6 +2417,25 @@
                     << "  T: " << IfTrue->getName()
                     << "  F: " << IfFalse->getName() << "\n");
 
+  // We need to be sure, that DomBlock has
+  // enough room for new instructions
+  // First add cost of Select instruction, that will be added to this block
+  // (this cost is equal to number of phi nodes in BB)
+  unsigned Cost = NumPhis;
+
+  if (CheckDomBlockInstructionsCount) {
+    for (const auto& Instr : *DomBlock)
+    {
+      if (!isa<BranchInst>(&Instr))
+        Cost += TTI.getUserCost(&Instr);
+    }
+
+    if (Cost > PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic) {
+      // DomBlock already too large
+      return false;
+    }
+  }
+
   // If we can still promote the PHI nodes after this gauntlet of tests,
   // do all of the PHI's now.
   Instruction *InsertPt = DomBlock->getTerminator();


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D67281.230235.patch
Type: text/x-patch
Size: 3687 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20191120/57e5265c/attachment-0001.bin>


More information about the llvm-commits mailing list