[llvm-commits] [llvm] r78492 - /llvm/trunk/utils/TableGen/AsmMatcherEmitter.cpp
Daniel Dunbar
daniel at zuster.org
Sat Aug 8 13:55:44 PDT 2009
On Sat, Aug 8, 2009 at 1:02 PM, Chris Lattner<sabre at nondot.org> wrote:
> Author: lattner
> Date: Sat Aug 8 15:02:57 2009
> New Revision: 78492
>
> URL: http://llvm.org/viewvc/llvm-project?rev=78492&view=rev
> Log:
> add a little function to do arbitrary string pattern matching in a
> much more efficient way than a sequence of if's. Switch MatchRegisterName
> to use it. It would be nice if someone could factor this out to a shared
> place in tblgen :)
Cool!
> +typedef std::pair<std::string, std::string> StringPair;
Not that it matters, but at least in the context of this functionality
I think this can use a StringRef; it always deals with substrings of
the existing strings, right?
> +/// EmitStringMatcherForChar - Given a set of strings that are known to be the
> +/// same length and whose characters leading up to CharNo are the same, emit
> +/// code to verify that CharNo and later are the same.
> +static void EmitStringMatcherForChar(const std::string &StrVariableName,
> + const std::vector<const StringPair*> &Matches,
> + unsigned CharNo, unsigned IndentCount,
> + raw_ostream &OS) {
> + assert(!Matches.empty() && "Must have at least one string to match!");
> + std::string Indent(IndentCount*2+4, ' ');
> +
> + // If we have verified that the entire string matches, we're done: output the
> + // matching code.
> + if (CharNo == Matches[0]->first.size()) {
> + assert(Matches.size() == 1 && "Had duplicate keys to match on");
> +
> + // FIXME: If Matches[0].first has embeded \n, this will be bad.
> + OS << Indent << Matches[0]->second << "\t // \"" << Matches[0]->first
> + << "\"\n";
> + return;
> + }
> +
> + // Bucket the matches by the character we are comparing.
> + std::map<char, std::vector<const StringPair*> > MatchesByLetter;
> +
> + for (unsigned i = 0, e = Matches.size(); i != e; ++i)
> + MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches[i]);
> +
> +
> + // If we have exactly one bucket to match, see how many characters are common
> + // across the whole set and match all of them at once.
> + // length, just verify the rest of it with one if.
Edito.
I think another similar simple optimization which can be done is to
match common suffixes. See the code that gets generated for "st(0)",
"st(1)", etc.
> + if (MatchesByLetter.size() == 1) {
> + unsigned FirstNonCommonLetter = FindFirstNonCommonLetter(Matches);
> + unsigned NumChars = FirstNonCommonLetter-CharNo;
> +
> + if (NumChars == 1) {
> + // Do the comparison with if (Str[1] == 'f')
> + // FIXME: Need to escape general characters.
> + OS << Indent << "if (" << StrVariableName << "[" << CharNo << "] == '"
> + << Matches[0]->first[CharNo] << "') {\n";
> + } else {
> + // Do the comparison with if (Str.substr(1,3) == "foo").
> + OS << Indent << "if (" << StrVariableName << ".substr(" << CharNo << ","
> + << NumChars << ") == \"";
> +
> + // FIXME: Need to escape general strings.
> + OS << Matches[0]->first.substr(CharNo, NumChars) << "\") {\n";
> + }
> +
> + EmitStringMatcherForChar(StrVariableName, Matches, FirstNonCommonLetter,
> + IndentCount+1, OS);
> + OS << Indent << "}\n";
> + return;
> + }
> +
> + // Otherwise, we have multiple possible things, emit a switch on the
> + // character.
> + OS << Indent << "switch (" << StrVariableName << "[" << CharNo << "]) {\n";
> + OS << Indent << "default: break;\n";
> +
> + for (std::map<char, std::vector<const StringPair*> >::iterator LI =
> + MatchesByLetter.begin(), E = MatchesByLetter.end(); LI != E; ++LI) {
> + // TODO: escape hard stuff (like \n) if we ever care about it.
> + OS << Indent << "case '" << LI->first << "':\t // "
> + << LI->second.size() << " strings to match.\n";
> + EmitStringMatcherForChar(StrVariableName, LI->second, CharNo+1,
> + IndentCount+1, OS);
> + OS << Indent << " break;\n";
> + }
> +
> + OS << Indent << "}\n";
> +
> +}
> +
> +
> +/// EmitStringMatcher - Given a list of strings and code to execute when they
> +/// match, output a simple switch tree to classify the input string. If a
> +/// match is found, the code in Vals[i].second is executed. This code should do
> +/// a return to avoid falling through. If nothing matches, execution falls
> +/// through. StrVariableName is the name of teh variable to test.
teh -> the.
IMHO, we should just implement this as a (string -> unsigned) matcher.
That's very frequently the use case, and when it isn't you aren't
necessarily worse off by using it as (string -> unsigned -> my generic
code), and you end up with more readable code (instead of intertwining
the generic actions with the matching code).
This lets the matcher implement assorted fun optimizations, like:
1. { "0" -> a + b*0, "1" -> a + b*1, ... "9" -> a + b*9 } to { "[0-9]"
-> a + (char - '0') * b }.
2. { "foo" -> 1, "bar" -> 1, "baz" -> 1, etc -> 0 } into a hash match.
> +static void EmitStringMatcher(const std::string &StrVariableName,
> + const std::vector<StringPair> &Matches,
> + raw_ostream &OS) {
> + // First level categorization: group strings by length.
> + std::map<unsigned, std::vector<const StringPair*> > MatchesByLength;
> +
> + for (unsigned i = 0, e = Matches.size(); i != e; ++i)
> + MatchesByLength[Matches[i].first.size()].push_back(&Matches[i]);
> +
> + // Output a switch statement on length and categorize the elements within each
> + // bin.
> + OS << " switch (" << StrVariableName << ".size()) {\n";
> + OS << " default: break;\n";
> +
> +
> + for (std::map<unsigned, std::vector<const StringPair*> >::iterator LI =
> + MatchesByLength.begin(), E = MatchesByLength.end(); LI != E; ++LI) {
> + OS << " case " << LI->first << ":\t // " << LI->second.size()
> + << " strings to match.\n";
> + EmitStringMatcherForChar(StrVariableName, LI->second, 0, 0, OS);
> + OS << " break;\n";
> + }
> +
> +
> + OS << " }\n";
> +}
> +
> +
> +
> /// EmitMatchRegisterName - Emit the function to match a string to the target
> /// specific register enum.
> static void EmitMatchRegisterName(CodeGenTarget &Target, Record *AsmParser,
> @@ -740,16 +871,20 @@
> << AsmParser->getValueAsString("AsmParserClassName")
> << "::MatchRegisterName(const StringRef &Name, unsigned &RegNo) {\n";
>
> + std::vector<StringPair> Matches;
> +
> // FIXME: TableGen should have a fast string matcher generator.
- FIXME? :)
> for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
> const CodeGenRegister &Reg = Registers[i];
> if (Reg.TheDef->getValueAsString("AsmName").empty())
> continue;
>
> - OS << " if (Name == \""
> - << Reg.TheDef->getValueAsString("AsmName") << "\")\n"
> - << " return RegNo=" << i + 1 << ", false;\n";
> + Matches.push_back(StringPair(Reg.TheDef->getValueAsString("AsmName"),
> + "RegNo=" + utostr(i + 1) + "; return false;"));
> }
> +
> + EmitStringMatcher("Name", Matches, OS);
> +
> OS << " return true;\n";
> OS << "}\n\n";
> }
Very cool, thanks.
- Daniel
More information about the llvm-commits
mailing list