[llvm] r312315 - [MergeICmps] MergeICmps is a new optimization pass that turns chains of integer
Clement Courbet via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 1 02:07:05 PDT 2017
Author: courbet
Date: Fri Sep 1 02:07:05 2017
New Revision: 312315
URL: http://llvm.org/viewvc/llvm-project?rev=312315&view=rev
Log:
[MergeICmps] MergeICmps is a new optimization pass that turns chains of integer
comparisons into memcmp.
Thanks to recent improvements in the LLVM codegen, the memcmp is typically
inlined as a chain of efficient hardware comparisons.
This typically benefits C++ member or nonmember operator==().
For now this is disabled by default until:
- https://bugs.llvm.org/show_bug.cgi?id=33329 is complete
- Benchmarks show that this is always useful.
Differential Revision:
https://reviews.llvm.org/D33987
Added:
llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
llvm/trunk/test/Transforms/MergeICmps/
llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll
llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll
llvm/trunk/test/Transforms/MergeICmps/volatile.ll
Modified:
llvm/trunk/include/llvm/InitializePasses.h
llvm/trunk/include/llvm/LinkAllPasses.h
llvm/trunk/include/llvm/Transforms/Scalar.h
llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
Modified: llvm/trunk/include/llvm/InitializePasses.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/InitializePasses.h?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/include/llvm/InitializePasses.h (original)
+++ llvm/trunk/include/llvm/InitializePasses.h Fri Sep 1 02:07:05 2017
@@ -256,6 +256,7 @@ void initializeMemorySSAPrinterLegacyPas
void initializeMemorySSAWrapperPassPass(PassRegistry&);
void initializeMemorySanitizerPass(PassRegistry&);
void initializeMergeFunctionsPass(PassRegistry&);
+void initializeMergeICmpsPass(PassRegistry&);
void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
void initializeMetaRenamerPass(PassRegistry&);
void initializeModuleDebugInfoPrinterPass(PassRegistry&);
Modified: llvm/trunk/include/llvm/LinkAllPasses.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/LinkAllPasses.h?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/include/llvm/LinkAllPasses.h (original)
+++ llvm/trunk/include/llvm/LinkAllPasses.h Fri Sep 1 02:07:05 2017
@@ -179,6 +179,7 @@ namespace {
(void) llvm::createPostOrderFunctionAttrsLegacyPass();
(void) llvm::createReversePostOrderFunctionAttrsPass();
(void) llvm::createMergeFunctionsPass();
+ (void) llvm::createMergeICmpsPass();
std::string buf;
llvm::raw_string_ostream os(buf);
(void) llvm::createPrintModulePass(os);
Modified: llvm/trunk/include/llvm/Transforms/Scalar.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Transforms/Scalar.h?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Transforms/Scalar.h (original)
+++ llvm/trunk/include/llvm/Transforms/Scalar.h Fri Sep 1 02:07:05 2017
@@ -422,6 +422,12 @@ Pass *createLowerGuardIntrinsicPass();
//===----------------------------------------------------------------------===//
//
+// MergeICmps - Merge integer comparison chains
+//
+Pass *createMergeICmpsPass();
+
+//===----------------------------------------------------------------------===//
+//
// ValuePropagation - Propagate CFG-derived value information
//
Pass *createCorrelatedValuePropagationPass();
Modified: llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetPassConfig.cpp?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetPassConfig.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetPassConfig.cpp Fri Sep 1 02:07:05 2017
@@ -94,6 +94,10 @@ static cl::opt<bool> EnableImplicitNullC
"enable-implicit-null-checks",
cl::desc("Fold null checks into faulting memory operations"),
cl::init(false));
+static cl::opt<bool> EnableMergeICmps(
+ "enable-mergeicmps",
+ cl::desc("Merge ICmp chains into a single memcmp"),
+ cl::init(false));
static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
cl::desc("Print LLVM IR produced by the loop-reduce pass"));
static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
@@ -591,6 +595,10 @@ void TargetPassConfig::addIRPasses() {
addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
}
+ if (getOptLevel() != CodeGenOpt::None && EnableMergeICmps) {
+ addPass(createMergeICmpsPass());
+ }
+
// Run GC lowering passes for builtin collectors
// TODO: add a pass insertion point here
addPass(createGCLoweringPass());
Modified: llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt (original)
+++ llvm/trunk/lib/Transforms/Scalar/CMakeLists.txt Fri Sep 1 02:07:05 2017
@@ -42,6 +42,7 @@ add_llvm_library(LLVMScalarOpts
LowerExpectIntrinsic.cpp
LowerGuardIntrinsic.cpp
MemCpyOptimizer.cpp
+ MergeICmps.cpp
MergedLoadStoreMotion.cpp
NaryReassociate.cpp
NewGVN.cpp
Added: llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp?rev=312315&view=auto
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp (added)
+++ llvm/trunk/lib/Transforms/Scalar/MergeICmps.cpp Fri Sep 1 02:07:05 2017
@@ -0,0 +1,641 @@
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits c++ member or nonmember operator==().
+//
+// The basic idea is to replace a larger chain of integer comparisons loaded
+// from contiguous memory locations with a smaller chain of such integer
+// comparisons. The benefits are twofold:
+// - There are fewer jumps, and therefore fewer opportunities for
+// mispredictions and I-cache misses.
+// - Code size is smaller, both because jumps are removed and because the
+// encoding of a 2*n byte compare is smaller than that of two n-byte
+// compares.
+
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// Define MERGEICMPS_DOT_ON to make BCECmpChain print the comparison chain as a
+// Graphviz digraph on errs() before and after reordering. It is left undefined
+// by default so that the pass does not write to stderr in regular builds.
+// #define MERGEICMPS_DOT_ON
+
+// A BCE atom: one operand of a comparison, described by the GEP that computes
+// its address, the load that reads it, and the constant offset of the GEP.
+struct BCEAtom {
+  // The pointer operand of the GEP, or null when no GEP has been recorded.
+  const Value *Base() const {
+    if (!GEP)
+      return nullptr;
+    return GEP->getPointerOperand();
+  }
+
+  // Orders atoms by base pointer first, then by signed offset.
+  bool operator<(const BCEAtom &O) const {
+    const Value *const ThisBase = Base();
+    const Value *const OtherBase = O.Base();
+    if (ThisBase != OtherBase)
+      return ThisBase < OtherBase;
+    return Offset.slt(O.Offset);
+  }
+
+  GetElementPtrInst *GEP = nullptr;
+  LoadInst *LoadI = nullptr;
+  APInt Offset;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or of the address outside the load's
+// block, returns the atom describing it (GEP, load and offset). In every other
+// case an atom whose Base() is null is returned.
+BCEAtom visitICmpLoadOperand(Value *const Val) {
+  auto *const LoadI = dyn_cast<LoadInst>(Val);
+  if (!LoadI)
+    return {};
+  DEBUG(dbgs() << "load\n");
+  if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+    DEBUG(dbgs() << "used outside of block\n");
+    return {};
+  }
+  // Neither volatile nor atomic accesses may be folded into a memcmp.
+  if (!LoadI->isSimple()) {
+    DEBUG(dbgs() << "volatile or atomic\n");
+    return {};
+  }
+  auto *const GEP = dyn_cast<GetElementPtrInst>(LoadI->getOperand(0));
+  if (!GEP)
+    return {};
+  DEBUG(dbgs() << "GEP\n");
+  // The address must not be needed elsewhere either: the block holding it may
+  // be erased when the chain is merged.
+  if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
+    DEBUG(dbgs() << "used outside of block\n");
+    return {};
+  }
+  const auto &DL = GEP->getModule()->getDataLayout();
+  if (!isDereferenceablePointer(GEP, DL)) {
+    DEBUG(dbgs() << "not dereferenceable\n");
+    // We need to make sure that we can do comparisons in any order, so we
+    // require memory to be unconditionally dereferenceable.
+    return {};
+  }
+  BCEAtom Result;
+  Result.Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+  if (!GEP->accumulateConstantOffset(DL, Result.Offset))
+    return {};
+  Result.GEP = GEP;
+  Result.LoadI = LoadI;
+  return Result;
+}
+
+// A basic block with a comparison between two BCE atoms.
+// Note: the {l/r}hs naming is only a convention, since the comparison is
+// symmetric. To get a canonical form, the smaller atom is stored as Lhs.
+class BCECmpBlock {
+ public:
+  BCECmpBlock() {}
+
+  BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+      : Lhs_(L), Rhs_(R), SizeBits_(SizeBits) {
+    // Canonicalize the operand order.
+    if (Rhs_ < Lhs_)
+      std::swap(Rhs_, Lhs_);
+  }
+
+  // A block is valid when both of its atoms have a base pointer.
+  bool IsValid() const { return Lhs_.Base() && Rhs_.Base(); }
+
+  // Asserts that the block is consistent: if valid, it should also have
+  // non-null members besides Lhs_ and Rhs_.
+  void AssertConsistent() const {
+    if (!IsValid())
+      return;
+    assert(BB);
+    assert(CmpI);
+    assert(BranchI);
+  }
+
+  const BCEAtom &Lhs() const { return Lhs_; }
+  const BCEAtom &Rhs() const { return Rhs_; }
+  int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does anything besides the comparison.
+  bool doesOtherWork() const;
+
+  // The basic block where this comparison happens.
+  BasicBlock *BB = nullptr;
+  // The ICMP for this comparison.
+  ICmpInst *CmpI = nullptr;
+  // The terminating branch.
+  BranchInst *BranchI = nullptr;
+
+ private:
+  BCEAtom Lhs_;
+  BCEAtom Rhs_;
+  int SizeBits_ = 0;
+};
+
+// Returns true when the block contains any instruction other than the GEPs
+// and loads of its two atoms, its icmp and its terminating branch.
+bool BCECmpBlock::doesOtherWork() const {
+  AssertConsistent();
+  // TODO(courbet): Can we allow some other things ? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+  // effects outside of the basic block.
+  // Note: The GEPs and/or loads are not necessarily in the same block.
+  for (const Instruction &Inst : *BB) {
+    const Instruction *const I = &Inst;
+    if (isa<GetElementPtrInst>(I)) {
+      if (I != Lhs_.GEP && I != Rhs_.GEP)
+        return true;
+      continue;
+    }
+    if (isa<LoadInst>(I)) {
+      if (I != Lhs_.LoadI && I != Rhs_.LoadI)
+        return true;
+      continue;
+    }
+    if (isa<ICmpInst>(I)) {
+      if (I != CmpI)
+        return true;
+      continue;
+    }
+    if (isa<BranchInst>(I)) {
+      if (I != BranchI)
+        return true;
+      continue;
+    }
+    // Any other kind of instruction counts as extra work.
+    return true;
+  }
+  return false;
+}
+
+// Visits the given comparison. When it has the expected predicate and both of
+// its operands are valid BCE atoms, returns the corresponding comparison;
+// otherwise returns a default-constructed (invalid) BCECmpBlock.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+                      const ICmpInst::Predicate ExpectedPredicate) {
+  if (CmpI->getPredicate() != ExpectedPredicate)
+    return {};
+  DEBUG(dbgs() << "cmp "
+               << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+               << "\n");
+  Value *const FirstOperand = CmpI->getOperand(0);
+  auto Lhs = visitICmpLoadOperand(FirstOperand);
+  if (!Lhs.Base())
+    return {};
+  auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1));
+  if (!Rhs.Base())
+    return {};
+  // The width of the comparison is taken from the type of the first operand.
+  const int SizeBits = FirstOperand->getType()->getScalarSizeInBits();
+  return BCECmpBlock(std::move(Lhs), std::move(Rhs), SizeBits);
+}
+
+// Visits the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison with CmpI and BranchI filled in; otherwise
+// returns an invalid BCECmpBlock.
+//  - Val is the value that the phi receives from Block.
+//  - PhiBlock is the block that contains the phi.
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+                          const BasicBlock *const PhiBlock) {
+  if (Block->empty())
+    return {};
+  auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+  if (!BranchI)
+    return {};
+  DEBUG(dbgs() << "branch\n");
+  if (BranchI->isUnconditional()) {
+    // In this case, we expect an incoming value which is the result of the
+    // comparison. This is the last link in the chain of comparisons (note
+    // that this does not mean that this is the last incoming value, blocks
+    // can be reordered).
+    auto *const CmpI = dyn_cast<ICmpInst>(Val);
+    if (!CmpI)
+      return {};
+    DEBUG(dbgs() << "icmp\n");
+    auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ);
+    Result.CmpI = CmpI;
+    Result.BranchI = BranchI;
+    return Result;
+  }
+  // In this case, we expect a constant incoming value (the comparison is
+  // chained). A non-constant value here means the block does not have the
+  // expected shape, so it is rejected rather than dereferenced.
+  const auto *const Const = dyn_cast<ConstantInt>(Val);
+  if (!Const)
+    return {};
+  DEBUG(dbgs() << "const\n");
+  if (!Const->isZero())
+    return {};
+  DEBUG(dbgs() << "false\n");
+  auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+  if (!CmpI)
+    return {};
+  DEBUG(dbgs() << "icmp\n");
+  assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+  BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+  // When the false edge goes to the phi, the chain continues on equality;
+  // otherwise it continues on inequality.
+  auto Result = visitICmp(
+      CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE);
+  Result.CmpI = CmpI;
+  Result.BranchI = BranchI;
+  return Result;
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+  // Analyzes Blocks (given in chain order) feeding Phi. When the blocks do not
+  // form a usable chain, the chain is left empty (size() == 0).
+  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+
+  // Number of comparisons retained in the chain.
+  int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+  void dump() const;
+#endif  // MERGEICMPS_DOT_ON
+
+  // Merges contiguous comparisons. Returns true if the IR was changed.
+  bool simplify(const TargetLibraryInfo *const TLI);
+
+ private:
+  // Returns true if Second starts exactly where First ends, on both sides of
+  // the comparison and relative to the same base pointers.
+  static bool IsContiguous(const BCECmpBlock &First,
+                           const BCECmpBlock &Second) {
+    return First.Lhs().Base() == Second.Lhs().Base() &&
+           First.Rhs().Base() == Second.Rhs().Base() &&
+           First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+           First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+  }
+
+  // Merges the given comparison blocks into one memcmp block and updates
+  // branches. Comparisons are assumed to be contiguous. If NextBBInChain is
+  // null, the merged block will link to the phi block.
+  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                               BasicBlock *const NextBBInChain, PHINode &Phi,
+                               const TargetLibraryInfo *const TLI);
+
+  PHINode &Phi_;
+  std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting). Null while the chain is empty.
+  BasicBlock *EntryBlock_ = nullptr;
+};
+
+// Builds the chain from Blocks, which must be in chain order and all feed Phi.
+// The chain is left empty when a block is not a valid comparison, when a block
+// other than the leading ones does extra work, or when no block is retained.
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+    : Phi_(Phi) {
+  // Now look inside blocks to check for BCE comparisons.
+  std::vector<BCECmpBlock> Comparisons;
+  for (BasicBlock *Block : Blocks) {
+    BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+                                           Block, Phi.getParent());
+    Comparison.BB = Block;
+    if (!Comparison.IsValid()) {
+      DEBUG(dbgs() << "skip: not a valid BCECmpBlock\n");
+      return;
+    }
+    if (Comparison.doesOtherWork()) {
+      DEBUG(dbgs() << "block does extra work besides compare\n");
+      if (Comparisons.empty()) {  // First block.
+        // TODO(courbet): The first block can do other things, and we should
+        // split them apart in a separate block before the comparison chain.
+        // Right now we just discard it and make the chain shorter.
+        DEBUG(dbgs()
+              << "ignoring first block that does extra work besides compare\n");
+        continue;
+      }
+      // TODO(courbet): Right now we abort the whole chain. We could be
+      // merging only the blocks that don't do other work and resume the
+      // chain from there. For example:
+      //  if (a[0] == b[0]) {  // bb1
+      //    if (a[1] == b[1]) {  // bb2
+      //      some_value = 3;  // bb3
+      //      if (a[2] == b[2]) {  // bb3
+      //        do a ton of stuff  // bb4
+      //      }
+      //    }
+      //  }
+      //
+      // This is:
+      //
+      // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+      //  |           |           |                |
+      //  ne          ne          ne               |
+      //  |           |           |                v
+      //  +-----------+-----------+----------> bb_phi
+      //
+      // We can only merge the first two comparisons, because bb3* does
+      // "other work" (setting some_value to 3).
+      // We could still merge bb1 and bb2 though.
+      return;
+    }
+    DEBUG(dbgs() << "*Found cmp of " << Comparison.SizeBits()
+                 << " bits between " << Comparison.Lhs().Base() << " + "
+                 << Comparison.Lhs().Offset << " and "
+                 << Comparison.Rhs().Base() << " + " << Comparison.Rhs().Offset
+                 << "\n");
+    DEBUG(dbgs() << "\n");
+    Comparisons.push_back(Comparison);
+  }
+  // Every block may have been discarded by the "first block" rule above (it
+  // applies for as long as nothing has been retained), so there may be no
+  // entry block to record.
+  if (Comparisons.empty()) {
+    DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+    return;
+  }
+  EntryBlock_ = Comparisons[0].BB;
+  Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "BEFORE REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+  // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+  std::sort(Comparisons_.begin(), Comparisons_.end(),
+            [](const BCECmpBlock &a, const BCECmpBlock &b) {
+              return a.Lhs() < b.Lhs();
+            });
+#ifdef MERGEICMPS_DOT_ON
+  errs() << "AFTER REORDERING:\n\n";
+  dump();
+#endif  // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+// Prints the chain to errs() as a Graphviz digraph: one node per comparison,
+// an edge between consecutive comparisons, and an edge from each comparison to
+// the phi labelled with the value the phi receives from that block.
+void BCECmpChain::dump() const {
+  raw_ostream &OS = errs();
+  OS << "digraph dag {\n";
+  OS << " graph [bgcolor=transparent];\n";
+  OS << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+  OS << " edge [color=black];\n";
+  const size_t Count = Comparisons_.size();
+  for (size_t Index = 0; Index != Count; ++Index) {
+    const BCECmpBlock &Cmp = Comparisons_[Index];
+    const BCEAtom &Left = Cmp.Lhs();
+    const BCEAtom &Right = Cmp.Rhs();
+    OS << " \"" << Index << "\" [label=\"%" << Left.Base()->getName() << " + "
+       << Left.Offset << " == %" << Right.Base()->getName() << " + "
+       << Right.Offset << " (" << (Cmp.SizeBits() / 8) << " bytes)\"];\n";
+    const Value *const Incoming = Phi_.getIncomingValueForBlock(Cmp.BB);
+    if (Index > 0)
+      OS << " \"" << (Index - 1) << "\" -> \"" << Index << "\";\n";
+    OS << " \"" << Index << "\" -> \"Phi\" [label=\"" << *Incoming << "\"];\n";
+  }
+  OS << " \"Phi\" [label=\"Phi\"];\n";
+  OS << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+// Merges every maximal run of contiguous comparisons of the (sorted) chain.
+// Returns false, leaving the IR untouched, when no two neighbouring
+// comparisons are contiguous; returns true otherwise.
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+  const size_t NumComparisons = Comparisons_.size();
+
+  // First pass to check if there is at least one merge. If not, we don't do
+  // anything and we keep analysis passes intact.
+  bool HasContiguousPair = false;
+  for (size_t I = 1; I < NumComparisons && !HasContiguousPair; ++I)
+    HasContiguousPair = IsContiguous(Comparisons_[I - 1], Comparisons_[I]);
+  if (!HasContiguousPair)
+    return false;
+
+  // Remove phi references to comparison blocks, they will be rebuilt as we
+  // merge the blocks.
+  for (const BCECmpBlock &Comparison : Comparisons_)
+    Phi_.removeIncomingValue(Comparison.BB, false);
+
+  // Point the predecessors of the chain to the first comparison block (which
+  // is the new entry point).
+  BasicBlock *const NewEntry = Comparisons_[0].BB;
+  if (EntryBlock_ != NewEntry)
+    EntryBlock_->replaceAllUsesWith(NewEntry);
+
+  // Effectively merge blocks: [RunStart, I) is the current run of contiguous
+  // comparisons; it is flushed whenever the next comparison is not contiguous.
+  const auto AllComparisons = makeArrayRef(Comparisons_);
+  size_t RunStart = 0;
+  for (size_t I = 1; I < NumComparisons; ++I) {
+    if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
+      continue;
+    mergeComparisons(AllComparisons.slice(RunStart, I - RunStart),
+                     Comparisons_[I].BB, Phi_, TLI);
+    RunStart = I;
+  }
+  // The last run links to the phi block.
+  mergeComparisons(AllComparisons.slice(RunStart, NumComparisons - RunStart),
+                   nullptr, Phi_, TLI);
+
+  return true;
+}
+
+// Rewrites the control flow for one run of comparison blocks.
+// - With two or more comparisons: the branch, icmp and both loads of the first
+// block are erased and replaced by a memcmp call over the summed size (in
+// bytes) followed by an `icmp eq` against 0; the other blocks of the run are
+// erased after their uses have been redirected to the first block.
+// - With a single comparison: only the terminator and the phi are updated.
+// In both cases an incoming value for the first block is added to Phi.
+// If NextBBInChain is null, the first block branches to the phi block only.
+void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const NextBBInChain,
+ PHINode &Phi,
+ const TargetLibraryInfo *const TLI) {
+ assert(!Comparisons.empty());
+ const auto &FirstComparison = *Comparisons.begin();
+ BasicBlock *const BB = FirstComparison.BB;
+ LLVMContext &Context = BB->getContext();
+
+ if (Comparisons.size() >= 2) {
+ DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
+ // Total number of bytes covered by the run (sizes are tracked in bits).
+ const auto TotalSize =
+ std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+ [](int Size, const BCECmpBlock &C) {
+ return Size + C.SizeBits();
+ }) /
+ 8;
+
+ // Incoming edges do not need to be updated, and both GEPs are already
+ // computing the right address, we just need to:
+ // - replace the two loads and the icmp with the memcmp
+ // - update the branch
+ // - update the incoming values in the phi.
+ // Users are erased before the values they use: branch, then icmp, then loads.
+ FirstComparison.BranchI->eraseFromParent();
+ FirstComparison.CmpI->eraseFromParent();
+ FirstComparison.Lhs().LoadI->eraseFromParent();
+ FirstComparison.Rhs().LoadI->eraseFromParent();
+
+ // BB no longer has a terminator, so the builder appends at its end.
+ IRBuilder<> Builder(BB);
+ const auto &DL = Phi.getModule()->getDataLayout();
+ Value *const MemCmpCall =
+ emitMemCmp(FirstComparison.Lhs().GEP, FirstComparison.Rhs().GEP,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSize),
+ Builder, DL, TLI);
+ Value *const MemCmpIsZero = Builder.CreateICmpEQ(
+ MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+
+ // Add a branch to the next basic block in the chain.
+ if (NextBBInChain) {
+ Builder.CreateCondBr(MemCmpIsZero, NextBBInChain, Phi.getParent());
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(MemCmpIsZero, BB);
+ }
+
+ // Delete merged blocks.
+ for (size_t I = 1; I < Comparisons.size(); ++I) {
+ BasicBlock *CBB = Comparisons[I].BB;
+ CBB->replaceAllUsesWith(BB);
+ CBB->eraseFromParent();
+ }
+ } else {
+ assert(Comparisons.size() == 1);
+ // There are no blocks to merge, but we still need to update the branches.
+ DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ if (NextBBInChain) {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> conditional\n");
+ // Just update the "true" target, the "false" target should already be
+ // the phi block.
+ assert(FirstComparison.BranchI->getSuccessor(1) == Phi.getParent());
+ FirstComparison.BranchI->setSuccessor(0, NextBBInChain);
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> conditional\n");
+ // Replace the unconditional branch by a conditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateCondBr(FirstComparison.CmpI, NextBBInChain,
+ Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ } else {
+ if (FirstComparison.BranchI->isConditional()) {
+ DEBUG(dbgs() << "conditional -> unconditional\n");
+ // Replace the conditional branch by an unconditional one.
+ FirstComparison.BranchI->eraseFromParent();
+ IRBuilder<> Builder(BB);
+ Builder.CreateBr(Phi.getParent());
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ } else {
+ DEBUG(dbgs() << "unconditional -> unconditional\n");
+ // The branch already targets the phi block; only the phi entry is restored.
+ Phi.addIncoming(FirstComparison.CmpI, BB);
+ }
+ }
+ }
+}
+
+// Reconstructs the order of the chain by walking single predecessors upwards
+// from LastBlock. Returns NumBlocks blocks, first block of the chain first, or
+// an empty vector when a visited block has its address taken, does not have a
+// single predecessor, or has a predecessor that is not an incoming block of
+// Phi.
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+                                           BasicBlock *const LastBlock,
+                                           int NumBlocks) {
+  std::vector<BasicBlock *> Blocks(NumBlocks);
+  BasicBlock *CurBlock = LastBlock;
+  int BlockIndex = NumBlocks - 1;
+  while (BlockIndex > 0) {
+    if (CurBlock->hasAddressTaken()) {
+      // Somebody is jumping to the block through an address, all bets are
+      // off.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has its address taken\n");
+      return {};
+    }
+    Blocks[BlockIndex] = CurBlock;
+    BasicBlock *const Pred = CurBlock->getSinglePredecessor();
+    if (!Pred) {
+      // The block does not have exactly one predecessor.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " has two or more predecessors\n");
+      return {};
+    }
+    if (Phi.getBasicBlockIndex(Pred) < 0) {
+      // The predecessor does not link back to the phi.
+      DEBUG(dbgs() << "skip: block " << BlockIndex
+                   << " does not link back to the phi\n");
+      return {};
+    }
+    CurBlock = Pred;
+    --BlockIndex;
+  }
+  Blocks[0] = CurBlock;
+  return Blocks;
+}
+
+// Tries to recognize a comparison chain ending in Phi and to merge its
+// contiguous comparisons. Returns true if the IR was changed.
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+  DEBUG(dbgs() << "processPhi()\n");
+  const unsigned NumIncoming = Phi.getNumIncomingValues();
+  if (NumIncoming <= 1) {
+    DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+    return false;
+  }
+  // We are looking for something that has the following structure:
+  // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+  //  |           |           |                |
+  //  ne          ne          ne               |
+  //  |           |           |                v
+  //  +-----------+-----------+----------> bb_phi
+  //
+  // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+  //   It's the only block that contributes a non-constant value to the Phi.
+  // - All other blocks (bb1, bb2, bb3) must have exactly two successors, one
+  //   of them being the phi block.
+  // - All intermediate blocks (bb2, bb3) must have only one predecessor.
+  // - Blocks cannot do other work besides the comparison, see doesOtherWork()
+
+  // The blocks are not necessarily ordered in the phi, so we start from the
+  // last block and reconstruct the order.
+  BasicBlock *LastBlock = nullptr;
+  for (unsigned I = 0; I != NumIncoming; ++I) {
+    if (!isa<ConstantInt>(Phi.getIncomingValue(I))) {
+      if (LastBlock) {
+        // There are several non-constant values.
+        DEBUG(dbgs() << "skip: several non-constant values\n");
+        return false;
+      }
+      LastBlock = Phi.getIncomingBlock(I);
+    }
+  }
+  if (!LastBlock) {
+    // There is no non-constant block.
+    DEBUG(dbgs() << "skip: no non-constant block\n");
+    return false;
+  }
+  BasicBlock *const PhiBlock = Phi.getParent();
+  if (LastBlock->getSingleSuccessor() != PhiBlock) {
+    DEBUG(dbgs() << "skip: last block non-phi successor\n");
+    return false;
+  }
+
+  const auto Blocks = getOrderedBlocks(Phi, LastBlock, NumIncoming);
+  if (Blocks.empty())
+    return false;
+
+  BCECmpChain CmpChain(Blocks, Phi);
+  if (CmpChain.size() < 2) {
+    DEBUG(dbgs() << "skip: only one compare block\n");
+    return false;
+  }
+
+  return CmpChain.simplify(TLI);
+}
+
+// Legacy function pass wrapper: fetches the TargetLibraryInfo analysis and
+// forwards to runImpl().
+class MergeICmps : public FunctionPass {
+ public:
+  static char ID;
+
+  MergeICmps() : FunctionPass(ID) {
+    initializeMergeICmpsPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Returns true if the function was modified.
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    const TargetLibraryInfo &TLI =
+        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    const PreservedAnalyses PA = runImpl(F, &TLI);
+    // Anything short of "all analyses preserved" means the IR changed.
+    return !PA.areAllPreserved();
+  }
+
+ private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+  }
+
+  PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI);
+};
+
+// Runs processPhi() on the leading phi (if any) of every basic block except
+// the first one. Returns PreservedAnalyses::none() if anything was changed,
+// PreservedAnalyses::all() otherwise.
+PreservedAnalyses MergeICmps::runImpl(Function &F,
+                                      const TargetLibraryInfo *TLI) {
+  DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
+
+  bool MadeChange = false;
+
+  auto BBIt = F.begin();
+  for (++BBIt; BBIt != F.end(); ++BBIt) {
+    // A Phi operation is always first in a basic block.
+    Instruction &FirstInst = *BBIt->begin();
+    if (auto *const Phi = dyn_cast<PHINode>(&FirstInst))
+      MadeChange |= processPhi(*Phi, TLI);
+  }
+
+  return MadeChange ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+} // namespace
+
+// Definition of the pass ID declared in the MergeICmps class.
+char MergeICmps::ID = 0;
+// Registers the pass under the command-line name "mergeicmps" and declares its
+// dependency on TargetLibraryInfoWrapperPass.
+INITIALIZE_PASS_BEGIN(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+
+// Factory declared in llvm/Transforms/Scalar.h; the caller receives a newly
+// allocated pass object.
+Pass *llvm::createMergeICmpsPass() { return new MergeICmps(); }
+
Modified: llvm/trunk/lib/Transforms/Scalar/Scalar.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/Scalar.cpp?rev=312315&r1=312314&r2=312315&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/Scalar.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/Scalar.cpp Fri Sep 1 02:07:05 2017
@@ -73,6 +73,7 @@ void llvm::initializeScalarOpts(PassRegi
initializeLowerExpectIntrinsicPass(Registry);
initializeLowerGuardIntrinsicLegacyPassPass(Registry);
initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergeICmpsPass(Registry);
initializeMergedLoadStoreMotionLegacyPassPass(Registry);
initializeNaryReassociateLegacyPassPass(Registry);
initializePartiallyInlineLibCallsLegacyPassPass(Registry);
Added: llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll?rev=312315&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll (added)
+++ llvm/trunk/test/Transforms/MergeICmps/pair-int32-int32.ll Fri Sep 1 02:07:05 2017
@@ -0,0 +1,87 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq1(
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+ %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+ %0 = load i32, i32* %first.i, align 4
+ %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+ %1 = load i32, i32* %first1.i, align 4
+ %cmp.i = icmp eq i32 %0, %1
+ br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+ %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %2 = load i32, i32* %second.i, align 4
+ %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+ %3 = load i32, i32* %second2.i, align 4
+ %cmp3.i = icmp eq i32 %2, %3
+ br label %opeq1.exit
+
+opeq1.exit:
+ %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+ ret i1 %4
+; CHECK-LABEL: @opeq1(
+; The entry block with zero-offset GEPs is kept, loads are removed.
+; CHECK: entry
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
+; CHECK: @memcmp({{.*}}8)
+; CHECK: icmp eq {{.*}} 0
+; The branch is now a direct branch; the other block has been removed.
+; CHECK: br label %opeq1.exit
+; CHECK-NOT: br
+; The phi is updated.
+; CHECK: phi i1 [ %{{[^,]*}}, %entry ]
+; CHECK-NEXT: ret
+}
+
+; Same as above, but the two blocks are in inverse order.
+define zeroext i1 @opeq1_inverse(
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry:
+ %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %0 = load i32, i32* %first.i, align 4
+ %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+ %1 = load i32, i32* %first1.i, align 4
+ %cmp.i = icmp eq i32 %0, %1
+ br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+ %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+ %2 = load i32, i32* %second.i, align 4
+ %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+ %3 = load i32, i32* %second2.i, align 4
+ %cmp3.i = icmp eq i32 %2, %3
+ br label %opeq1.exit
+
+opeq1.exit:
+ %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+ ret i1 %4
+; CHECK-LABEL: @opeq1_inverse(
+; The second block with zero-offset GEPs is kept, loads are removed.
+; CHECK: land.rhs.i
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; CHECK: getelementptr {{.*}} i32 0
+; CHECK-NOT: load
+; The two 4 byte loads and compares are replaced with a single 8-byte memcmp.
+; CHECK: @memcmp({{.*}}8)
+; CHECK: icmp eq {{.*}} 0
+; The branch is now a direct branch; the other block has been removed.
+; CHECK: br label %opeq1.exit
+; CHECK-NOT: br
+; The phi is updated.
+; CHECK: phi i1 [ %{{[^,]*}}, %land.rhs.i ]
+; CHECK-NEXT: ret
+}
+
+
+
Added: llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll?rev=312315&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll (added)
+++ llvm/trunk/test/Transforms/MergeICmps/tuple-four-int8.ll Fri Sep 1 02:07:05 2017
@@ -0,0 +1,73 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+; This is a more involved test: clang generates this weird pattern for
+; tuple<uint8_t, uint8_t, uint8_t, uint8_t>. Right now we skip the entry block
+; (which defines the base pointer for other blocks) and the last one (which
+; does not have the expected structure). Only middle blocks (bytes [1,2]) are
+; merged.
+
+%"class.std::tuple" = type { %"struct.std::_Tuple_impl" }
+%"struct.std::_Tuple_impl" = type { %"struct.std::_Tuple_impl.0", %"struct.std::_Head_base.6" }
+%"struct.std::_Tuple_impl.0" = type { %"struct.std::_Tuple_impl.1", %"struct.std::_Head_base.5" }
+%"struct.std::_Tuple_impl.1" = type { %"struct.std::_Tuple_impl.2", %"struct.std::_Head_base.4" }
+%"struct.std::_Tuple_impl.2" = type { %"struct.std::_Head_base" }
+%"struct.std::_Head_base" = type { i8 }
+%"struct.std::_Head_base.4" = type { i8 }
+%"struct.std::_Head_base.5" = type { i8 }
+%"struct.std::_Head_base.6" = type { i8 }
+
+define zeroext i1 @opeq(
+ %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,
+ %"class.std::tuple"* nocapture readonly dereferenceable(4) %b) local_unnamed_addr #1 {
+entry: ; defines the base pointers %0 / %2 and compares the bytes at offset 3
+ %0 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %a, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
+ %add.ptr.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 3
+ %1 = load i8, i8* %add.ptr.i.i.i.i.i, align 1
+ %2 = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* %b, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
+ %add.ptr.i.i.i6.i.i = getelementptr inbounds i8, i8* %2, i64 3
+ %3 = load i8, i8* %add.ptr.i.i.i6.i.i, align 1
+ %cmp.i.i = icmp eq i8 %1, %3
+ br i1 %cmp.i.i, label %land.rhs.i.i, label %opeq.exit
+
+land.rhs.i.i: ; compares the bytes at offset 2
+ %add.ptr.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 2
+ %4 = load i8, i8* %add.ptr.i.i.i.i.i.i, align 1
+ %add.ptr.i.i.i6.i.i.i = getelementptr inbounds i8, i8* %2, i64 2
+ %5 = load i8, i8* %add.ptr.i.i.i6.i.i.i, align 1
+ %cmp.i.i.i = icmp eq i8 %4, %5
+ br i1 %cmp.i.i.i, label %land.rhs.i.i.i, label %opeq.exit
+
+land.rhs.i.i.i: ; compares the bytes at offset 1
+ %add.ptr.i.i.i.i.i.i.i = getelementptr inbounds i8, i8* %0, i64 1
+ %6 = load i8, i8* %add.ptr.i.i.i.i.i.i.i, align 1
+ %add.ptr.i.i.i6.i.i.i.i = getelementptr inbounds i8, i8* %2, i64 1
+ %7 = load i8, i8* %add.ptr.i.i.i6.i.i.i.i, align 1
+ %cmp.i.i.i.i = icmp eq i8 %6, %7
+ br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %opeq.exit
+
+land.rhs.i.i.i.i: ; compares the bytes at offset 0, loading directly from %0 / %2 without a GEP
+ %8 = load i8, i8* %0, align 1
+ %9 = load i8, i8* %2, align 1
+ %cmp.i.i.i.i.i = icmp eq i8 %8, %9
+ br label %opeq.exit
+
+opeq.exit:
+ %10 = phi i1 [ false, %entry ], [ false, %land.rhs.i.i ], [ false, %land.rhs.i.i.i ], [ %cmp.i.i.i.i.i, %land.rhs.i.i.i.i ]
+ ret i1 %10
+; CHECK-LABEL: @opeq(
+; The entry block is kept as is, but the next block is now the merged comparison
+; block for bytes [1,2] or the block for the head.
+; CHECK: entry
+; CHECK: br i1 %cmp.i.i, label %land.rhs.i.i.i{{(.i)?}}, label %opeq.exit
+; The two 1 byte loads and compares at offsets 1 and 2 are replaced with a
+; single 2-byte memcmp.
+; CHECK: land.rhs.i.i.i
+; CHECK: @memcmp({{.*}}2)
+; CHECK: icmp eq {{.*}} 0
+; In the end we have three blocks feeding the phi.
+; CHECK: phi i1
+; CHECK-SAME: %entry
+; CHECK-SAME: %land.rhs.i.i.i.i
+; CHECK-SAME: %land.rhs.i.i.i
+}
+
Added: llvm/trunk/test/Transforms/MergeICmps/volatile.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/MergeICmps/volatile.ll?rev=312315&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/MergeICmps/volatile.ll (added)
+++ llvm/trunk/test/Transforms/MergeICmps/volatile.ll Fri Sep 1 02:07:05 2017
@@ -0,0 +1,30 @@
+; RUN: opt -mergeicmps -S -o - %s | FileCheck %s
+
+%"struct.std::pair" = type { i32, i32 }
+
+define zeroext i1 @opeq(
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %a,
+ %"struct.std::pair"* nocapture readonly dereferenceable(8) %b) local_unnamed_addr #0 {
+entry: ; compares field index 0 of %a and %b
+ %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+ %0 = load i32, i32* %first.i, align 4
+ %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+ %1 = load i32, i32* %first1.i, align 4
+ %cmp.i = icmp eq i32 %0, %1
+ br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i: ; compares field index 1 of %a and %b
+ %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+ %2 = load volatile i32, i32* %second.i, align 4 ; volatile load: the test expects the comparison chain NOT to be turned into a memcmp
+ %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+ %3 = load i32, i32* %second2.i, align 4
+ %cmp3.i = icmp eq i32 %2, %3
+ br label %opeq1.exit
+
+opeq1.exit: ; result is false from %entry, or the second comparison from %land.rhs.i
+ %4 = phi i1 [ false, %entry ], [ %cmp3.i, %land.rhs.i ]
+ ret i1 %4
+; CHECK-LABEL: @opeq(
+; CHECK-NOT: memcmp
+}
+
More information about the llvm-commits
mailing list