[llvm] r323277 - [TableGen] Optimize the regex search.

Benjamin Kramer via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 23 15:05:04 PST 2018


Author: d0k
Date: Tue Jan 23 15:05:04 2018
New Revision: 323277

URL: http://llvm.org/viewvc/llvm-project?rev=323277&view=rev
Log:
[TableGen] Optimize the regex search.

llvm::Regex is still the slowest regex engine on earth; running it over
all instructions on X86 takes a while. Extract a literal prefix and use a
binary search to reduce the search space before we resort to regex matching.

There are a couple of caveats here:
- The generic opcodes are outside of the sorted enum. They're handled in an extra loop.
- If there's a top-level bar ('|') we can't use the prefix trick.
- We bail on a top-level '?'. This could be handled, but it's rare.

This brings the time to generate X86GenInstrInfo.inc from 21s to 4.7s on
my machine.

Modified:
    llvm/trunk/utils/TableGen/CodeGenSchedule.cpp

Modified: llvm/trunk/utils/TableGen/CodeGenSchedule.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/utils/TableGen/CodeGenSchedule.cpp?rev=323277&r1=323276&r2=323277&view=diff
==============================================================================
--- llvm/trunk/utils/TableGen/CodeGenSchedule.cpp (original)
+++ llvm/trunk/utils/TableGen/CodeGenSchedule.cpp Tue Jan 23 15:05:04 2018
@@ -12,17 +12,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "CodeGenInstruction.h"
 #include "CodeGenSchedule.h"
+#include "CodeGenInstruction.h"
 #include "CodeGenTarget.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
 #include <algorithm>
 #include <iterator>
@@ -50,36 +51,91 @@ struct InstrsOp : public SetTheory::Oper
 };
 
 // (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
-//
-// TODO: Since this is a prefix match, perform a binary search over the
-// instruction names using lower_bound. Note that the predefined instrs must be
-// scanned linearly first. However, this is only safe if the regex pattern has
-// no top-level bars. The DAG already has a list of patterns, so there's no
-// reason to use top-level bars, but we need a way to verify they don't exist
-// before implementing the optimization.
 struct InstRegexOp : public SetTheory::Operator {
   const CodeGenTarget &Target;
   InstRegexOp(const CodeGenTarget &t): Target(t) {}
 
+  /// Return a copy of S without parentheses or any text enclosed by them.
+  static std::string removeParens(llvm::StringRef S) {
+    std::string Result;
+    unsigned Paren = 0;
+    // NB: Escaped parens are not special-cased; they count like plain ones.
+    for (char C : S) {
+      switch (C) {
+      case '(':
+        ++Paren;
+        break;
+      case ')':
+        --Paren;
+        break;
+      default:
+        if (Paren == 0)
+          Result += C;
+      }
+    }
+    return Result;
+  }
+
   void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
              ArrayRef<SMLoc> Loc) override {
-    SmallVector<Regex, 4> RegexList;
+    SmallVector<std::pair<StringRef, Optional<Regex>>, 4> RegexList;
     for (Init *Arg : make_range(Expr->arg_begin(), Expr->arg_end())) {
       StringInit *SI = dyn_cast<StringInit>(Arg);
       if (!SI)
-        PrintFatalError(Loc, "instregex requires pattern string: "
-          + Expr->getAsString());
-      std::string pat = SI->getValue();
-      // Implement a python-style prefix match.
+        PrintFatalError(Loc, "instregex requires pattern string: " +
+                                 Expr->getAsString());
+      // Extract a prefix that we can binary search on.
+      static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
+      auto FirstMeta = SI->getValue().find_first_of(RegexMetachars);
+      // Look for top-level | or ?. We cannot optimize them to binary search.
+      if (removeParens(SI->getValue()).find_first_of("|?") != std::string::npos)
+        FirstMeta = 0;
+      StringRef Prefix = SI->getValue().substr(0, FirstMeta);
+      std::string pat = SI->getValue().substr(FirstMeta);
+      if (pat.empty()) {
+        RegexList.push_back(std::make_pair(Prefix, None));
+        continue;
+      }
+      // For the rest use a python-style prefix match.
       if (pat[0] != '^') {
         pat.insert(0, "^(");
         pat.insert(pat.end(), ')');
       }
-      RegexList.push_back(Regex(pat));
+      RegexList.push_back(std::make_pair(Prefix, Regex(pat)));
     }
-    for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
-      for (auto &R : RegexList) {
-        if (R.match(Inst->TheDef->getName()))
+    for (auto &R : RegexList) {
+      // The generic opcodes are unsorted, handle them manually.
+      for (auto *Inst : Target.getInstructionsByEnumValue().slice(
+               0, TargetOpcode::GENERIC_OP_END + 1)) {
+        if (Inst->TheDef->getName().startswith(R.first) &&
+            (!R.second ||
+             R.second->match(Inst->TheDef->getName().substr(R.first.size()))))
+          Elts.insert(Inst->TheDef);
+      }
+
+      ArrayRef<const CodeGenInstruction *> Instructions =
+          Target.getInstructionsByEnumValue().slice(
+              TargetOpcode::GENERIC_OP_END + 1);
+
+      // Target instructions are sorted. Find the range that starts with our
+      // prefix.
+      struct Comp {
+        bool operator()(const CodeGenInstruction *LHS, StringRef RHS) {
+          return LHS->TheDef->getName() < RHS;
+        }
+        bool operator()(StringRef LHS, const CodeGenInstruction *RHS) {
+          return LHS < RHS->TheDef->getName() &&
+                 !RHS->TheDef->getName().startswith(LHS);
+        }
+      };
+      auto Range = std::equal_range(Instructions.begin(), Instructions.end(),
+                                    R.first, Comp());
+
+      // For this range we know that it starts with the prefix. Check if there's
+      // a regex that needs to be checked.
+      for (auto *Inst : make_range(Range)) {
+        if (!R.second ||
+            R.second->match(Inst->TheDef->getName().substr(R.first.size())))
           Elts.insert(Inst->TheDef);
       }
     }




More information about the llvm-commits mailing list