[llvm] r329564 - [MergeICmp] Split blocks that do other work.
Xin Tong via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 9 06:14:06 PDT 2018
Author: trentxintong
Date: Mon Apr 9 06:14:06 2018
New Revision: 329564
URL: http://llvm.org/viewvc/llvm-project?rev=329564&view=rev
Log:
[MergeICmp] Split blocks that do other work.
Summary:
We do not try to move the instructions and split the block till we
know the blocks can be split, i.e. BCE-cmp-insts can be separated from
non-BCE-cmp-insts.
Reviewers: davide, courbet
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D44443
Added:
llvm/trunk/test/Transforms/MergeICmps/X86/split-block-does-work.ll
Modified:
llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
llvm/trunk/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
Modified: llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp?rev=329564&r1=329563&r2=329564&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp Mon Apr 9 06:14:06 2018
@@ -110,6 +110,10 @@ BCEAtom visitICmpLoadOperand(Value *cons
}
// A basic block with a comparison between two BCE atoms.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
// Note: the terminology is misleading: the comparison is symmetric, so there
// is no real {l/r}hs. What we want though is to have the same base on the
// left (resp. right), so that we can detect consecutive loads. To ensure this
@@ -144,19 +148,83 @@ class BCECmpBlock {
// Returns true if the block does other works besides comparison.
bool doesOtherWork() const;
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit() const;
+
+ // Return true if this all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent) const;
+
// The basic block where this comparison happens.
BasicBlock *BB = nullptr;
// The ICMP for this comparison.
ICmpInst *CmpI = nullptr;
// The terminating branch.
BranchInst *BranchI = nullptr;
+ // The block requires splitting.
+ bool RequireSplit = false;
- private:
+private:
BCEAtom Lhs_;
BCEAtom Rhs_;
int SizeBits_ = 0;
};
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts) const {
+ // If this instruction has side effects and its in middle of the BCE cmp block
+ // instructions, then bail for now.
+ // TODO: use alias analysis to tell whether there is real interference.
+ if (Inst->mayHaveSideEffects())
+ return false;
+ // Make sure this instruction does not use any of the BCE cmp block
+ // instructions as operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+ // This is a non-BCE-cmp-block instruction. And it can be separated
+ // from the BCE-cmp-block instruction.
+ OtherInsts.push_back(&Inst);
+ }
+
+ // Do the actual spliting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
+
+bool BCECmpBlock::canSplit() const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts))
+ return false;
+ }
+ }
+ return true;
+}
+
bool BCECmpBlock::doesOtherWork() const {
AssertConsistent();
// All the instructions we care about in the BCE cmp block.
@@ -241,6 +309,17 @@ BCECmpBlock visitCmpBlock(Value *const V
return {};
}
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &Comparison) {
+ DEBUG(dbgs() << "Block '" << Comparison.BB->getName() << "': Found cmp of "
+ << Comparison.SizeBits() << " bits between "
+ << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset
+ << " and " << Comparison.Rhs().Base() << " + "
+ << Comparison.Rhs().Offset << "\n");
+ DEBUG(dbgs() << "\n");
+ Comparisons.push_back(Comparison);
+}
+
// A chain of comparisons.
class BCECmpChain {
public:
@@ -266,9 +345,9 @@ class BCECmpChain {
// Merges the given comparison blocks into one memcmp block and update
// branches. Comparisons are assumed to be continguous. If NextBBInChain is
// null, the merged block will link to the phi block.
- static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const NextBBInChain, PHINode &Phi,
- const TargetLibraryInfo *const TLI);
+ void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain, PHINode &Phi,
+ const TargetLibraryInfo *const TLI);
PHINode &Phi_;
std::vector<BCECmpBlock> Comparisons_;
@@ -295,12 +374,28 @@ BCECmpChain::BCECmpChain(const std::vect
DEBUG(dbgs() << "block '" << Comparison.BB->getName()
<< "' does extra work besides compare\n");
if (Comparisons.empty()) {
- // TODO(courbet): The initial block can do other things, and we should
- // split them apart in a separate block before the comparison chain.
- // Right now we just discard it and make the chain shorter.
- DEBUG(dbgs()
- << "ignoring initial block '" << Comparison.BB->getName()
- << "' that does extra work besides compare\n");
+ // This is the initial block in the chain, in case this block does other
+ // work, we can try to split the block and move the irrelevant
+ // instructions to the predecessor.
+ //
+ // If this is not the initial block in the chain, splitting it wont
+ // work.
+ //
+ // As once split, there will still be instructions before the BCE cmp
+ // instructions that do other work in program order, i.e. within the
+ // chain before sorting. Unless we can abort the chain at this point
+ // and start anew.
+ //
+ // NOTE: we only handle block with single predecessor for now.
+ if (Comparison.canSplit()) {
+ DEBUG(dbgs() << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, Comparison);
+ } else {
+ DEBUG(dbgs() << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
continue;
}
// TODO(courbet): Right now we abort the whole chain. We could be
@@ -328,13 +423,7 @@ BCECmpChain::BCECmpChain(const std::vect
// We could still merge bb1 and bb2 though.
return;
}
- DEBUG(dbgs() << "Block '" << Comparison.BB->getName()<< "': Found cmp of "
- << Comparison.SizeBits() << " bits between "
- << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset
- << " and " << Comparison.Rhs().Base() << " + "
- << Comparison.Rhs().Offset << "\n");
- DEBUG(dbgs() << "\n");
- Comparisons.push_back(Comparison);
+ enqueueBlock(Comparisons, Comparison);
}
// It is possible we have no suitable comparison to merge.
@@ -416,9 +505,11 @@ bool BCECmpChain::simplify(const TargetL
}
// Point the predecessors of the chain to the first comparison block (which is
- // the new entry point).
- if (EntryBlock_ != Comparisons_[0].BB)
+ // the new entry point) and update the entry block of the chain.
+ if (EntryBlock_ != Comparisons_[0].BB) {
EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+ EntryBlock_ = Comparisons_[0].BB;
+ }
// Effectively merge blocks.
int NumMerged = 1;
@@ -450,6 +541,14 @@ void BCECmpChain::mergeComparisons(Array
LLVMContext &Context = BB->getContext();
if (Comparisons.size() >= 2) {
+ // If there is one block that requires splitting, we do it now, i.e.
+ // just before we know we will collapse the chain. The instructions
+ // can be executed before any of the instructions in the chain.
+ auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
+ [](const BCECmpBlock &B) { return B.RequireSplit; });
+ if (C != Comparisons.end())
+ C->split(EntryBlock_);
+
DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
const auto TotalSize =
std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
Added: llvm/trunk/test/Transforms/MergeICmps/X86/split-block-does-work.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeICmps/X86/split-block-does-work.ll?rev=329564&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/MergeICmps/X86/split-block-does-work.ll (added)
+++ llvm/trunk/test/Transforms/MergeICmps/X86/split-block-does-work.ll Mon Apr 9 06:14:06 2018
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s --check-prefix=X86
+
+%"struct.std::pair" = type { i32, i32, i32, i32 }
+
+declare void @foo(...) nounwind readnone
+
+; We can split %entry and create a memcmp(16 bytes).
+define zeroext i1 @opeq1(
+; X86-LABEL: @opeq1(
+;
+; Make sure this call is moved to the beginning of the entry block.
+; X86: entry:
+; X86-NEXT: call void (...) @foo()
+; X86-NEXT: [[THIRD_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; X86-NEXT: [[THIRD1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86-NEXT: [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8*
+; X86-NEXT: [[CSTR1:%.*]] = bitcast i32* [[THIRD1_I]] to i8*
+; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16)
+; X86-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0
+; X86-NEXT: br label [[OPEQ1_EXIT:%.*]]
+;
+ %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+ %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+ %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+ %0 = load i32, i32* %first.i, align 4
+ %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+ %1 = load i32, i32* %first1.i, align 4
+ ; Does other work.
+ call void (...) @foo()
+ %cmp.i = icmp eq i32 %0, %1
+ br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+ %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %2 = load i32, i32* %second.i, align 4
+ %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+ %3 = load i32, i32* %second2.i, align 4
+ %cmp2.i = icmp eq i32 %2, %3
+ br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+ %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+ %4 = load i32, i32* %third.i, align 4
+ %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2
+ %5 = load i32, i32* %third2.i, align 4
+ %cmp3.i = icmp eq i32 %4, %5
+ br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+ %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3
+ %6 = load i32, i32* %fourth.i, align 4
+ %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+ %7 = load i32, i32* %fourth2.i, align 4
+ %cmp4.i = icmp eq i32 %6, %7
+ br label %opeq1.exit
+
+opeq1.exit:
+ %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+ ret i1 %8
+}
+
+
+; We will not be able to merge anything, make sure the call is not moved out.
+define zeroext i1 @opeq1_discontiguous(
+; X86-LABEL: @opeq1_discontiguous(
+;
+; Make sure this call is moved in the entry block.
+; X86: entry:
+; X86: [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 1
+; X86: [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86: call void (...) @foo()
+ %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+ %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+ %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %0 = load i32, i32* %first.i, align 4
+ %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+ %1 = load i32, i32* %first1.i, align 4
+ ; Does other work.
+ call void (...) @foo()
+ %cmp.i = icmp eq i32 %0, %1
+ br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+ %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+ %2 = load i32, i32* %second.i, align 4
+ %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+ %3 = load i32, i32* %second2.i, align 4
+ %cmp2.i = icmp eq i32 %2, %3
+ br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+ %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+ %4 = load i32, i32* %third.i, align 4
+ %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+ %5 = load i32, i32* %third2.i, align 4
+ %cmp3.i = icmp eq i32 %4, %5
+ br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+ %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %6 = load i32, i32* %fourth.i, align 4
+ %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+ %7 = load i32, i32* %fourth2.i, align 4
+ %cmp4.i = icmp eq i32 %6, %7
+ br label %opeq1.exit
+
+opeq1.exit:
+ %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+ ret i1 %8
+}
Modified: llvm/trunk/test/Transforms/MergeICmps/X86/tuple-four-int8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeICmps/X86/tuple-four-int8.ll?rev=329564&r1=329563&r2=329564&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/MergeICmps/X86/tuple-four-int8.ll (original)
+++ llvm/trunk/test/Transforms/MergeICmps/X86/tuple-four-int8.ll Mon Apr 9 06:14:06 2018
@@ -19,28 +19,24 @@
define zeroext i1 @opeq(
; CHECK-LABEL: @opeq(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
-; CHECK-NEXT: [[A_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 3
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[A_ELEM3_ADDR]], align 1
-; CHECK-NEXT: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
-; CHECK-NEXT: [[B_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 3
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[B_ELEM3_ADDR]], align 1
-; CHECK-NEXT: [[CMP_ELEM3:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT: br i1 [[CMP_ELEM3]], label [[LAND_ELEM0:%.*]], label [[OPEQ_EXIT:%.*]]
+;
+; These 2 instructions are split. Then we can merge 3 bytes, instead of 2.
+; CHECK: br label [[LAND_ELEM0:%.*]]
; CHECK: land.elem1:
-; CHECK-NEXT: [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 1
-; CHECK-NEXT: [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 1
-; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 2)
+; CHECK-NEXT: [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %a.base, i64 1
+; CHECK-NEXT: [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %b.base, i64 1
+; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 3)
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0
-; CHECK-NEXT: br label [[OPEQ_EXIT]]
+; CHECK-NEXT: br label [[OPEQ_EXIT:%.*]]
; CHECK: land.elem0:
+; CHECK: [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
+; CHECK: [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[A_BASE]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[B_BASE]], align 1
; CHECK-NEXT: [[CMP_ELEM0:%.*]] = icmp eq i8 [[TMP3]], [[TMP4]]
; CHECK-NEXT: br i1 [[CMP_ELEM0]], label [[LAND_ELEM1:%.*]], label [[OPEQ_EXIT]]
; CHECK: opeq.exit:
-; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ]
; CHECK-NEXT: ret i1 [[TMP5]]
;
%"class.std::tuple"* nocapture readonly dereferenceable(4) %a,
More information about the llvm-commits
mailing list