[llvm] r323277 - [TableGen] Optimize the regex search.
Benjamin Kramer via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 23 15:05:04 PST 2018
Author: d0k
Date: Tue Jan 23 15:05:04 2018
New Revision: 323277
URL: http://llvm.org/viewvc/llvm-project?rev=323277&view=rev
Log:
[TableGen] Optimize the regex search.
llvm::Regex is still the slowest regex engine on earth, running it over
all instructions on X86 takes a while. Extract a prefix and use a binary
search to reduce the search space before we resort to regex matching.
There are a couple of caveats here:
- The generic opcodes are outside of the sorted enum. They're handled in an extra loop.
- If there's a top-level bar we can't use the prefix trick.
- We bail on top-level ?. This could be handled, but it's rare.
This brings the time to generate X86GenInstrInfo.inc from 21s to 4.7s on
my machine.
Modified:
llvm/trunk/utils/TableGen/CodeGenSchedule.cpp
Modified: llvm/trunk/utils/TableGen/CodeGenSchedule.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/TableGen/CodeGenSchedule.cpp?rev=323277&r1=323276&r2=323277&view=diff
==============================================================================
--- llvm/trunk/utils/TableGen/CodeGenSchedule.cpp (original)
+++ llvm/trunk/utils/TableGen/CodeGenSchedule.cpp Tue Jan 23 15:05:04 2018
@@ -12,17 +12,18 @@
//
//===----------------------------------------------------------------------===//
-#include "CodeGenInstruction.h"
#include "CodeGenSchedule.h"
+#include "CodeGenInstruction.h"
#include "CodeGenTarget.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
#include <algorithm>
#include <iterator>
@@ -50,36 +51,91 @@ struct InstrsOp : public SetTheory::Oper
};
// (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
-//
-// TODO: Since this is a prefix match, perform a binary search over the
-// instruction names using lower_bound. Note that the predefined instrs must be
-// scanned linearly first. However, this is only safe if the regex pattern has
-// no top-level bars. The DAG already has a list of patterns, so there's no
-// reason to use top-level bars, but we need a way to verify they don't exist
-// before implementing the optimization.
struct InstRegexOp : public SetTheory::Operator {
const CodeGenTarget &Target;
InstRegexOp(const CodeGenTarget &t): Target(t) {}
+ /// Remove any text inside of parentheses from S.
+ static std::string removeParens(llvm::StringRef S) {
+ std::string Result;
+ unsigned Paren = 0;
+ // NB: We don't care about escaped parens here.
+ for (char C : S) {
+ switch (C) {
+ case '(':
+ ++Paren;
+ break;
+ case ')':
+ --Paren;
+ break;
+ default:
+ if (Paren == 0)
+ Result += C;
+ }
+ }
+ return Result;
+ }
+
void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
ArrayRef<SMLoc> Loc) override {
- SmallVector<Regex, 4> RegexList;
+ SmallVector<std::pair<StringRef, Optional<Regex>>, 4> RegexList;
for (Init *Arg : make_range(Expr->arg_begin(), Expr->arg_end())) {
StringInit *SI = dyn_cast<StringInit>(Arg);
if (!SI)
- PrintFatalError(Loc, "instregex requires pattern string: "
- + Expr->getAsString());
- std::string pat = SI->getValue();
- // Implement a python-style prefix match.
+ PrintFatalError(Loc, "instregex requires pattern string: " +
+ Expr->getAsString());
+ // Extract a prefix that we can binary search on.
+ static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
+ auto FirstMeta = SI->getValue().find_first_of(RegexMetachars);
+ // Look for top-level | or ?. We cannot optimize them to binary search.
+ if (removeParens(SI->getValue()).find_first_of("|?") != std::string::npos)
+ FirstMeta = 0;
+ StringRef Prefix = SI->getValue().substr(0, FirstMeta);
+ std::string pat = SI->getValue().substr(FirstMeta);
+ if (pat.empty()) {
+ RegexList.push_back(std::make_pair(Prefix, None));
+ continue;
+ }
+ // For the rest use a python-style prefix match.
if (pat[0] != '^') {
pat.insert(0, "^(");
pat.insert(pat.end(), ')');
}
- RegexList.push_back(Regex(pat));
+ RegexList.push_back(std::make_pair(Prefix, Regex(pat)));
}
- for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
- for (auto &R : RegexList) {
- if (R.match(Inst->TheDef->getName()))
+ for (auto &R : RegexList) {
+ // The generic opcodes are unsorted, handle them manually.
+ for (auto *Inst : Target.getInstructionsByEnumValue().slice(
+ 0, TargetOpcode::GENERIC_OP_END + 1)) {
+ if (Inst->TheDef->getName().startswith(R.first) &&
+ (!R.second ||
+ R.second->match(Inst->TheDef->getName().substr(R.first.size()))))
+ Elts.insert(Inst->TheDef);
+ }
+
+ ArrayRef<const CodeGenInstruction *> Instructions =
+ Target.getInstructionsByEnumValue().slice(
+ TargetOpcode::GENERIC_OP_END + 1);
+
+ // Target instructions are sorted. Find the range that starts with our
+ // prefix.
+ struct Comp {
+ bool operator()(const CodeGenInstruction *LHS, StringRef RHS) {
+ return LHS->TheDef->getName() < RHS;
+ }
+ bool operator()(StringRef LHS, const CodeGenInstruction *RHS) {
+ return LHS < RHS->TheDef->getName() &&
+ !RHS->TheDef->getName().startswith(LHS);
+ }
+ };
+ auto Range = std::equal_range(Instructions.begin(), Instructions.end(),
+ R.first, Comp());
+
+ // For this range we know that it starts with the prefix. Check if there's
+ // a regex that needs to be checked.
+ for (auto *Inst : make_range(Range)) {
+ if (!R.second ||
+ R.second->match(Inst->TheDef->getName().substr(R.first.size())))
Elts.insert(Inst->TheDef);
}
}
More information about the llvm-commits
mailing list