[lld] r295225 - Apply different tokenization rules to linker script expressions.

Wed Feb 15 11:58:18 PST 2017

Author: ruiu
Date: Wed Feb 15 13:58:17 2017
New Revision: 295225

URL: http://llvm.org/viewvc/llvm-project?rev=295225&view=rev
Log:
Apply different tokenization rules to linker script expressions.

The linker script lexer is context-sensitive. In the regular context,
arithmetic operator characters are regular characters, but in the
expression context, they are independent tokens. This affects how the
lexer tokenizes "3*4", for example. (This kind of expression is real;
the Linux kernel uses it.)

This patch defines function `maybeSplitExpr`. This function splits the
current token into multiple expression tokens if the lexer is in the
expression context.

Differential Revision: https://reviews.llvm.org/D29963

Modified:
    lld/trunk/ELF/LinkerScript.cpp
    lld/trunk/ELF/ScriptLexer.cpp
    lld/trunk/ELF/ScriptLexer.h
    lld/trunk/test/ELF/linkerscript/operators.s

Modified: lld/trunk/ELF/LinkerScript.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/LinkerScript.cpp?rev=295225&r1=295224&r2=295225&view=diff
==============================================================================

--- lld/trunk/ELF/LinkerScript.cpp (original)
+++ lld/trunk/ELF/LinkerScript.cpp Wed Feb 15 13:58:17 2017
@@ -1610,9 +1610,7 @@ SymbolAssignment *ScriptParser::readAssi
   Expr E;
   assert(Op == "=" || Op == "+=");
   if (consume("ABSOLUTE")) {
-    // The RHS may be something like "ABSOLUTE(.) & 0xff".
-    // Call readExpr1 to read the whole expression.
-    E = readExpr1(readParenExpr(), 0);
+    E = readExpr();
     E.IsAbsolute = [] { return true; };
   } else {
     E = readExpr();
@@ -1628,7 +1626,15 @@ SymbolAssignment *ScriptParser::readAssi
 
 // This is an operator-precedence parser to parse a linker
 // script expression.
-Expr ScriptParser::readExpr() { return readExpr1(readPrimary(), 0); }
+Expr ScriptParser::readExpr() {
+  // Our lexer is context-aware. Set the in-expression bit so that
+  // it applies the expression tokenization rules while we parse.
+  bool Orig = InExpr;
+  InExpr = true;
+  Expr E = readExpr1(readPrimary(), 0);
+  InExpr = Orig; // Restore the value InExpr had on entry.
+  return E;
+}
 
 static Expr combine(StringRef Op, Expr L, Expr R) {
   auto IsAbs = [=] { return L.IsAbsolute() && R.IsAbsolute(); };

Modified: lld/trunk/ELF/ScriptLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/ScriptLexer.cpp?rev=295225&r1=295224&r2=295225&view=diff
==============================================================================
--- lld/trunk/ELF/ScriptLexer.cpp (original)
+++ lld/trunk/ELF/ScriptLexer.cpp Wed Feb 15 13:58:17 2017
@@ -26,18 +26,9 @@
 // lookahead is labels in version scripts, where we need to parse "local :"
 // as if "local:".
 //
-// Overall, this lexer works fine for most linker scripts. There's room
-// for improving compatibility, but that's probably not at the top of our
-// todo list.
-//
-// A caveat: This lexer splits an input string into tokens ahead of time,
-// so the lexer is not context aware. There's one known corner case. Let's
-// say the next string is "val*3" (without quotes). In the context where
-// the parser is expecting an expression, that should be tokenizes to
-// "val", "*" and "3". In other context, it should be just a single
-// token. (If it is in a filename context, it'll be interpeted as a glob
-// pattern, for example.)  We want to fix this, but it probably needs a
-// redesign of this lexer.
+// Overall, this lexer works fine for most linker scripts. There might
+// be room for improving compatibility, but that's probably not at the
+// top of our todo list.
 //
 //===----------------------------------------------------------------------===//
 
@@ -175,7 +166,60 @@ StringRef ScriptLexer::skipSpace(StringR
 // An erroneous token is handled as if it were the last token before EOF.
 bool ScriptLexer::atEOF() { return Error || Tokens.size() == Pos; }
 
+// Split a given string into expression tokens at arithmetic operators.
+// For example, this function returns "3", "*" and "5" for "3*5".
+static std::vector<StringRef> tokenizeExpr(StringRef S) {
+  StringRef Ops = "+-*/"; // List of operators
+
+  // A quoted string is a literal string, so we don't want to split it.
+  if (S.startswith("\""))
+    return {S};
+
+  // Split S at each operator; every operator becomes its own token.
+  std::vector<StringRef> Ret;
+  while (!S.empty()) {
+    size_t E = S.find_first_of(Ops);
+
+    // No operator remains, so the rest of S is a single token.
+    if (E == StringRef::npos) {
+      Ret.push_back(S);
+      break;
+    }
+
+    // Get the token before the operator, unless the operator comes first.
+    if (E != 0)
+      Ret.push_back(S.substr(0, E));
+
+    // Get the operator as a one-character token and continue after it.
+    Ret.push_back(S.substr(E, 1));
+    S = S.substr(E + 1);
+  }
+  return Ret;
+}
+
+// In contexts where expressions are expected, the lexer should apply
+// different tokenization rules than the default ones. By default,
+// arithmetic operator characters are regular characters, but in the
+// expression context, they should be independent tokens.
+//
+// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
+// in the expression context.
+//
+// If InExpr is set, this may replace Tokens[Pos] with several tokens.
+void ScriptLexer::maybeSplitExpr() {
+  if (!InExpr || Error || atEOF())
+    return;
+
+  std::vector<StringRef> V = tokenizeExpr(Tokens[Pos]);
+  if (V.size() == 1) // A single token means there was nothing to split.
+    return;
+  Tokens.erase(Tokens.begin() + Pos);
+  Tokens.insert(Tokens.begin() + Pos, V.begin(), V.end()); // Same position.
+}
+
 StringRef ScriptLexer::next() {
+  maybeSplitExpr();
+
   if (Error)
     return "";
   if (atEOF()) {

Modified: lld/trunk/ELF/ScriptLexer.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/ScriptLexer.h?rev=295225&r1=295224&r2=295225&view=diff
==============================================================================
--- lld/trunk/ELF/ScriptLexer.h (original)
+++ lld/trunk/ELF/ScriptLexer.h Wed Feb 15 13:58:17 2017
@@ -36,10 +36,12 @@ public:
 
   std::vector<MemoryBufferRef> MBs;
   std::vector<StringRef> Tokens;
+  bool InExpr = false;
   size_t Pos = 0;
   bool Error = false;
 
 private:
+  void maybeSplitExpr();
   StringRef getLine();
   size_t getLineNumber();
   size_t getColumnNumber();

Modified: lld/trunk/test/ELF/linkerscript/operators.s
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/test/ELF/linkerscript/operators.s?rev=295225&r1=295224&r2=295225&view=diff
==============================================================================
--- lld/trunk/test/ELF/linkerscript/operators.s (original)
+++ lld/trunk/test/ELF/linkerscript/operators.s Wed Feb 15 13:58:17 2017
@@ -5,6 +5,7 @@
 # RUN:  minus = 5 - 1; \
 # RUN:  div = 6 / 2; \
 # RUN:  mul = 1 + 2 * 3; \
+# RUN:  nospace = 1+2*6/3; \
 # RUN:  braces = 1 + (2 + 3) * 4; \
 # RUN:  and = 0xbb & 0xee; \
 # RUN:  ternary1 = 1 ? 1 : 2; \
@@ -32,6 +33,7 @@
 # CHECK: 00000000000004 *ABS* 00000000 minus
 # CHECK: 00000000000003 *ABS* 00000000 div
 # CHECK: 00000000000007 *ABS* 00000000 mul
+# CHECK: 00000000000005 *ABS* 00000000 nospace
 # CHECK: 00000000000015 *ABS* 00000000 braces
 # CHECK: 000000000000aa *ABS* 00000000 and
 # CHECK: 00000000000001 *ABS* 00000000 ternary1