[llvm-commits] [llvm] r78492 - /llvm/trunk/utils/TableGen/AsmMatcherEmitter.cpp

Sat Aug 8 15:48:04 PDT 2009

On Aug 8, 2009, at 3:19 PM, Evan Cheng wrote:

> Nice. Can we use it to fix up dagisel?

Does dagisel match strings anywhere?

-Chris

>
> Evan
>
> On Aug 8, 2009, at 1:55 PM, Daniel Dunbar <daniel at zuster.org> wrote:
>
>> On Sat, Aug 8, 2009 at 1:02 PM, Chris Lattner<sabre at nondot.org>  
>> wrote:
>>> Author: lattner
>>> Date: Sat Aug  8 15:02:57 2009
>>> New Revision: 78492
>>>
>>> URL: http://llvm.org/viewvc/llvm-project?rev=78492&view=rev
>>> Log:
>>> add a little function to do arbitrary string pattern matching in a
>>> much more efficient way than a sequence of if's.  Switch
>>> MatchRegisterName
>>> to use it.  It would be nice if someone could factor this out to a
>>> shared
>>> place in tblgen :)
>>
>> Cool!
>>
>>> +typedef std::pair<std::string, std::string> StringPair;
>>
>> Not that it matters, but at least in the context of this  
>> functionality
>> I think this can use a StringRef; it always deals with substrings of
>> the existing strings, right?
>>
>>> +/// EmitStringMatcherForChar - Given a set of strings that are
>>> known to be the
>>> +/// same length and whose characters leading up to CharNo are the
>>> same, emit
>>> +/// code to verify that CharNo and later are the same.
>>> +static void EmitStringMatcherForChar(const std::string
>>> &StrVariableName,
>>> +                                  const std::vector<const
>>> StringPair*> &Matches,
>>> +                                     unsigned CharNo, unsigned
>>> IndentCount,
>>> +                                     raw_ostream &OS) {
>>> +  assert(!Matches.empty() && "Must have at least one string to
>>> match!");
>>> +  std::string Indent(IndentCount*2+4, ' ');
>>> +
>>> +  // If we have verified that the entire string matches, we're
>>> done: output the
>>> +  // matching code.
>>> +  if (CharNo == Matches[0]->first.size()) {
>>> +    assert(Matches.size() == 1 && "Had duplicate keys to match  
>>> on");
>>> +
>>> +    // FIXME: If Matches[0].first has embeded \n, this will be bad.
>>> +    OS << Indent << Matches[0]->second << "\t // \"" << Matches[0]-
>>>> first
>>> +       << "\"\n";
>>> +    return;
>>> +  }
>>> +
>>> +  // Bucket the matches by the character we are comparing.
>>> +  std::map<char, std::vector<const StringPair*> > MatchesByLetter;
>>> +
>>> +  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
>>> +    MatchesByLetter[Matches[i]->first[CharNo]].push_back(Matches
>>> [i]);
>>> +
>>> +
>>> +  // If we have exactly one bucket to match, see how many
>>> characters are common
>>> +  // across the whole set and match all of them at once.
>>> +  // length, just verify the rest of it with one if.
>>
>> Edito.
>>
>> I think another similar simple optimization which can be done is to
>> match common suffixes. See the code that gets generated for "st(0)",
>> "st(1)", etc.
>>
>>> +  if (MatchesByLetter.size() == 1) {
>>> +    unsigned FirstNonCommonLetter = FindFirstNonCommonLetter
>>> (Matches);
>>> +    unsigned NumChars = FirstNonCommonLetter-CharNo;
>>> +
>>> +    if (NumChars == 1) {
>>> +      // Do the comparison with if (Str[1] == 'f')
>>> +      // FIXME: Need to escape general characters.
>>> +      OS << Indent << "if (" << StrVariableName << "[" << CharNo
>>> << "] == '"
>>> +         << Matches[0]->first[CharNo] << "') {\n";
>>> +    } else {
>>> +      // Do the comparison with if (Str.substr(1,3) == "foo").
>>> +      OS << Indent << "if (" << StrVariableName << ".substr(" <<
>>> CharNo << ","
>>> +         << NumChars << ") == \"";
>>> +
>>> +      // FIXME: Need to escape general strings.
>>> +      OS << Matches[0]->first.substr(CharNo, NumChars) << "\")  
>>> {\n";
>>> +    }
>>> +
>>> +    EmitStringMatcherForChar(StrVariableName, Matches,
>>> FirstNonCommonLetter,
>>> +                             IndentCount+1, OS);
>>> +    OS << Indent << "}\n";
>>> +    return;
>>> +  }
>>> +
>>> +  // Otherwise, we have multiple possible things, emit a switch on
>>> the
>>> +  // character.
>>> +  OS << Indent << "switch (" << StrVariableName << "[" << CharNo
>>> << "]) {\n";
>>> +  OS << Indent << "default: break;\n";
>>> +
>>> +  for (std::map<char, std::vector<const StringPair*> >::iterator
>>> LI =
>>> +       MatchesByLetter.begin(), E = MatchesByLetter.end(); LI !=
>>> E; ++LI) {
>>> +    // TODO: escape hard stuff (like \n) if we ever care about it.
>>> +    OS << Indent << "case '" << LI->first << "':\t // "
>>> +       << LI->second.size() << " strings to match.\n";
>>> +    EmitStringMatcherForChar(StrVariableName, LI->second, CharNo+1,
>>> +                             IndentCount+1, OS);
>>> +    OS << Indent << "  break;\n";
>>> +  }
>>> +
>>> +  OS << Indent << "}\n";
>>> +
>>> +}
>>> +
>>> +
>>> +/// EmitStringMatcher - Given a list of strings and code to
>>> execute when they
>>> +/// match, output a simple switch tree to classify the input
>>> string.  If a
>>> +/// match is found, the code in Vals[i].second is executed.  This
>>> code should do
>>> +/// a return to avoid falling through.  If nothing matches,
>>> execution falls
>>> +/// through.  StrVariableName is the name of teh variable to test.
>>
>> teh -> the.
>>
>> IMHO, we should just implement this as a (string -> unsigned)  
>> matcher.
>> That's very frequently the use case, and when it isn't you aren't
>> necessarily worse off by using it as (string -> unsigned -> my  
>> generic
>> code), and you end up with more readable code (instead of  
>> intertwining
>> the generic actions with the matching code).
>>
>> This lets the matcher implement assorted fun optimizations, like:
>>
>> 1. { "0" -> a + b*0, "1" -> a + b*1, ... "9" -> a + b*9} to { "[0-9]"
>> -> a + (char - '0') * b}.
>>
>> 2. { "foo" -> 1, "bar" -> 1, "baz" -> 1, etc -> 0 } into a hash  
>> match.
>>
>>> +static void EmitStringMatcher(const std::string &StrVariableName,
>>> +                              const std::vector<StringPair>
>>> &Matches,
>>> +                              raw_ostream &OS) {
>>> +  // First level categorization: group strings by length.
>>> +  std::map<unsigned, std::vector<const StringPair*> >
>>> MatchesByLength;
>>> +
>>> +  for (unsigned i = 0, e = Matches.size(); i != e; ++i)
>>> +    MatchesByLength[Matches[i].first.size()].push_back(&Matches 
>>> [i]);
>>> +
>>> +  // Output a switch statement on length and categorize the
>>> elements within each
>>> +  // bin.
>>> +  OS << "  switch (" << StrVariableName << ".size()) {\n";
>>> +  OS << "  default: break;\n";
>>> +
>>> +
>>> +  for (std::map<unsigned, std::vector<const StringPair*>
>>>> ::iterator LI =
>>> +       MatchesByLength.begin(), E = MatchesByLength.end(); LI !=
>>> E; ++LI) {
>>> +    OS << "  case " << LI->first << ":\t // " << LI->second.size()
>>> +       << " strings to match.\n";
>>> +    EmitStringMatcherForChar(StrVariableName, LI->second, 0, 0,  
>>> OS);
>>> +    OS << "    break;\n";
>>> +  }
>>> +
>>> +
>>> +  OS << "  }\n";
>>> +}
>>> +
>>> +
>>> +
>>> /// EmitMatchRegisterName - Emit the function to match a string to
>>> the target
>>> /// specific register enum.
>>> static void EmitMatchRegisterName(CodeGenTarget &Target, Record
>>> *AsmParser,
>>> @@ -740,16 +871,20 @@
>>>     << AsmParser->getValueAsString("AsmParserClassName")
>>>     << "::MatchRegisterName(const StringRef &Name, unsigned
>>> &RegNo) {\n";
>>>
>>> +  std::vector<StringPair> Matches;
>>> +
>>>  // FIXME: TableGen should have a fast string matcher generator.
>>
>> - FIXME? :)
>>
>>>  for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
>>>    const CodeGenRegister &Reg = Registers[i];
>>>    if (Reg.TheDef->getValueAsString("AsmName").empty())
>>>      continue;
>>>
>>> -    OS << "  if (Name == \""
>>> -       << Reg.TheDef->getValueAsString("AsmName") << "\")\n"
>>> -       << "    return RegNo=" << i + 1 << ", false;\n";
>>> +    Matches.push_back(StringPair(Reg.TheDef->getValueAsString
>>> ("AsmName"),
>>> +                                 "RegNo=" + utostr(i + 1) + ";
>>> return false;"));
>>>  }
>>> +
>>> +  EmitStringMatcher("Name", Matches, OS);
>>> +
>>>  OS << "  return true;\n";
>>>  OS << "}\n\n";
>>> }
>>
>> Very cool, thanks.
>>
>> - Daniel
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at cs.uiuc.edu
>> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits