[llvm] r310621 - Add .rc scripts tokenizer.
Marek Sokolowski via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 10 09:21:44 PDT 2017
Author: mnbvmar
Date: Thu Aug 10 09:21:44 2017
New Revision: 310621
URL: http://llvm.org/viewvc/llvm-project?rev=310621&view=rev
Log:
Add .rc scripts tokenizer.
This extends the shell of the llvm-rc tool with the ability to tokenize
input files. Currently, ASCII and ASCII-compatible UTF-8 files
are supported.
Thanks to Nico Weber (thakis) for his original work in this area.
Differential Revision: https://reviews.llvm.org/D35957
Added:
llvm/trunk/test/tools/llvm-rc/Inputs/
llvm/trunk/test/tools/llvm-rc/Inputs/tokens.rc
llvm/trunk/test/tools/llvm-rc/tokenizer.test
llvm/trunk/tools/llvm-rc/ResourceScriptToken.cpp
llvm/trunk/tools/llvm-rc/ResourceScriptToken.h
llvm/trunk/tools/llvm-rc/ResourceScriptTokenList.h
Modified:
llvm/trunk/tools/llvm-rc/CMakeLists.txt
llvm/trunk/tools/llvm-rc/llvm-rc.cpp
Added: llvm/trunk/test/tools/llvm-rc/Inputs/tokens.rc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/Inputs/tokens.rc?rev=310621&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/Inputs/tokens.rc (added)
+++ llvm/trunk/test/tools/llvm-rc/Inputs/tokens.rc Thu Aug 10 09:21:44 2017
@@ -0,0 +1,8 @@
+1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
+He11o LLVM
+
+"RC string test.",L"Another RC string test.'&{",42,100
+
+
+
+ ":))"
Added: llvm/trunk/test/tools/llvm-rc/tokenizer.test
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/tokenizer.test?rev=310621&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/tokenizer.test (added)
+++ llvm/trunk/test/tools/llvm-rc/tokenizer.test Thu Aug 10 09:21:44 2017
@@ -0,0 +1,35 @@
+; RUN: llvm-rc /V %p/Inputs/tokens.rc | FileCheck %s
+
+; CHECK: Int: 1; int value = 1
+; CHECK-NEXT: Plus: +
+; CHECK-NEXT: Int: 2; int value = 2
+; CHECK-NEXT: Minus: -
+; CHECK-NEXT: Int: 3214L; int value = 3214
+; CHECK-NEXT: Amp: &
+; CHECK-NEXT: Int: 0x120894; int value = 1181844
+; CHECK-NEXT: Int: 032173; int value = 13435
+; CHECK-NEXT: Int: 2; int value = 2
+; CHECK-NEXT: Pipe: |
+; CHECK-NEXT: Amp: &
+; CHECK-NEXT: Tilde: ~
+; CHECK-NEXT: Plus: +
+; CHECK-NEXT: LeftParen: (
+; CHECK-NEXT: Minus: -
+; CHECK-NEXT: Int: 7; int value = 7
+; CHECK-NEXT: RightParen: )
+; CHECK-NEXT: BlockBegin: {
+; CHECK-NEXT: Int: 0xabcdef; int value = 11259375
+; CHECK-NEXT: Int: 0xABCDEFl; int value = 11259375
+; CHECK-NEXT: BlockEnd: }
+; CHECK-NEXT: BlockBegin: Begin
+; CHECK-NEXT: BlockEnd: End
+; CHECK-NEXT: Identifier: He11o
+; CHECK-NEXT: Identifier: LLVM
+; CHECK-NEXT: String: "RC string test."
+; CHECK-NEXT: Comma: ,
+; CHECK-NEXT: String: L"Another RC string test.'&{"
+; CHECK-NEXT: Comma: ,
+; CHECK-NEXT: Int: 42; int value = 42
+; CHECK-NEXT: Comma: ,
+; CHECK-NEXT: Int: 100; int value = 100
+; CHECK-NEXT: String: ":))"
Modified: llvm/trunk/tools/llvm-rc/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/CMakeLists.txt?rev=310621&r1=310620&r2=310621&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/CMakeLists.txt (original)
+++ llvm/trunk/tools/llvm-rc/CMakeLists.txt Thu Aug 10 09:21:44 2017
@@ -10,4 +10,5 @@ add_public_tablegen_target(RcTableGen)
add_llvm_tool(llvm-rc
llvm-rc.cpp
+ ResourceScriptToken.cpp
)
Added: llvm/trunk/tools/llvm-rc/ResourceScriptToken.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/ResourceScriptToken.cpp?rev=310621&view=auto
==============================================================================
--- llvm/trunk/tools/llvm-rc/ResourceScriptToken.cpp (added)
+++ llvm/trunk/tools/llvm-rc/ResourceScriptToken.cpp Thu Aug 10 09:21:44 2017
@@ -0,0 +1,296 @@
+//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file implements an interface defined in ResourceScriptToken.h.
+// In particular, it defines an .rc script tokenizer.
+//
+//===---------------------------------------------------------------------===//
+
+#include "ResourceScriptToken.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <utility>
+
+using namespace llvm;
+
+using Kind = RCToken::Kind;
+
+// Checks if Representation is a correct description of an RC integer.
+// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
+// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
+// character (that is the difference between our representation and
+// StringRef's one). If Representation is correct, 'true' is returned and
+// the parsed value is stored in Num.
+static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
+  if (Representation.empty())
+    return false;
+  // Strip the trailing 'L' suffix, if present. The cast to unsigned char keeps
+  // std::toupper well-defined for bytes outside the 7-bit ASCII range.
+  if (std::toupper(static_cast<unsigned char>(Representation.back())) == 'L')
+    Representation = Representation.drop_back(1);
+
+  return !Representation.getAsInteger<uint32_t>(0, Num);
+}
+
+RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
+ : TokenKind(RCTokenKind), TokenValue(Value) {}
+
+uint32_t RCToken::intValue() const {
+ assert(TokenKind == Kind::Int);
+ // We assume that the token already is a correct integer (checked by
+ // rcGetAsInteger).
+ uint32_t Result;
+ bool IsSuccess = rcGetAsInteger(TokenValue, Result);
+ assert(IsSuccess);
+ (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
+ return Result;
+}
+
+StringRef RCToken::value() const { return TokenValue; }
+
+Kind RCToken::kind() const { return TokenKind; }
+
+static Error getStringError(const Twine &message) {
+ return make_error<StringError>("Error parsing file: " + message,
+ inconvertibleErrorCode());
+}
+
+namespace {
+
+// Splits the text of an .rc script into a sequence of RCTokens.
+class Tokenizer {
+public:
+  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
+
+  Expected<std::vector<RCToken>> run();
+
+private:
+  // All 'advancing' methods return boolean values; if they're equal to false,
+  // the stream has ended or failed.
+  bool advance(size_t Amount = 1);
+  bool skipWhitespaces();
+
+  // Consumes a token. If any problem occurred, a non-empty Error is returned.
+  Error consumeToken(const Kind TokenKind);
+
+  // Check if tokenizer is about to read FollowingChars.
+  bool willNowRead(StringRef FollowingChars) const;
+
+  // Check if tokenizer can start reading an identifier at current position.
+  // The original tool did not specify the rules to determine what is a correct
+  // identifier. We assume they should follow the C convention:
+  // [a-zA-Z_][a-zA-Z0-9_]*.
+  bool canStartIdentifier() const;
+  // Check if tokenizer can continue reading an identifier.
+  bool canContinueIdentifier() const;
+
+  // Check if tokenizer can start reading an integer.
+  // A correct integer always starts with a 0-9 digit, can contain characters
+  // 0-9A-Fa-f (digits), Ll (marking the integer is 32-bit), Xx (marking the
+  // representation is hexadecimal). As some kind of separator should come
+  // after the integer, we can consume the integer until a non-alphanumeric
+  // character.
+  bool canStartInt() const;
+  bool canContinueInt() const;
+
+  bool canStartString() const;
+
+  bool streamEof() const;
+
+  // Classify the token that is about to be read from the current position.
+  Kind classifyCurrentToken() const;
+
+  // Process the Kind::Identifier token - check if it is
+  // an identifier describing a block start or end.
+  void processIdentifier(RCToken &Token) const;
+
+  StringRef Data;
+  size_t DataLength, Pos;
+};
+
+Expected<std::vector<RCToken>> Tokenizer::run() {
+ Pos = 0;
+ std::vector<RCToken> Result;
+
+ // Consume an optional UTF-8 Byte Order Mark.
+ if (willNowRead("\xef\xbb\xbf"))
+ advance(3);
+
+ while (!streamEof()) {
+ if (!skipWhitespaces())
+ break;
+
+ Kind TokenKind = classifyCurrentToken();
+ if (TokenKind == Kind::Invalid)
+ return getStringError("Invalid token found at position " + Twine(Pos));
+
+ const size_t TokenStart = Pos;
+ if (Error TokenError = consumeToken(TokenKind))
+ return std::move(TokenError);
+
+ RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
+ if (TokenKind == Kind::Identifier) {
+ processIdentifier(Token);
+ } else if (TokenKind == Kind::Int) {
+ uint32_t TokenInt;
+ if (!rcGetAsInteger(Token.value(), TokenInt)) {
+ // The integer has incorrect format or cannot be represented in
+ // a 32-bit integer.
+ return getStringError("Integer invalid or too large: " +
+ Token.value().str());
+ }
+ }
+
+ Result.push_back(Token);
+ }
+
+ return Result;
+}
+
+bool Tokenizer::advance(size_t Amount) {
+ Pos += Amount;
+ return !streamEof();
+}
+
+bool Tokenizer::skipWhitespaces() {
+  while (!streamEof() && std::isspace(static_cast<unsigned char>(Data[Pos])))
+    advance(); // The cast above keeps isspace defined for non-ASCII bytes.
+  return !streamEof();
+}
+
+Error Tokenizer::consumeToken(const Kind TokenKind) {
+  switch (TokenKind) {
+  // One-character token consumption.
+#define TOKEN(Name)
+#define SHORT_TOKEN(Name, Ch) case Kind::Name:
+#include "ResourceScriptTokenList.h"
+#undef TOKEN
+#undef SHORT_TOKEN
+    advance();
+    return Error::success();
+
+  case Kind::Identifier:
+    while (!streamEof() && canContinueIdentifier())
+      advance();
+    return Error::success();
+
+  case Kind::Int:
+    while (!streamEof() && canContinueInt())
+      advance();
+    return Error::success();
+
+  case Kind::String:
+    // Consume the preceding 'L', if there is any.
+    if (std::toupper(Data[Pos]) == 'L')
+      advance();
+    // Consume the double-quote.
+    advance();
+
+    // Consume the characters until the end of the file, line or string.
+    while (true) {
+      if (streamEof()) {
+        return getStringError("Unterminated string literal.");
+      } else if (Data[Pos] == '"') {
+        // Consume the ending double-quote.
+        advance();
+        return Error::success();
+      } else if (Data[Pos] == '\n') {
+        return getStringError("String literal not terminated in the line.");
+      }
+      advance();
+    }
+
+  case Kind::Invalid:
+    break; // run() rejects Invalid tokens before calling consumeToken().
+  }
+  llvm_unreachable("Cannot consume an invalid token.");
+}
+
+bool Tokenizer::willNowRead(StringRef FollowingChars) const {
+ return Data.drop_front(Pos).startswith(FollowingChars);
+}
+
+bool Tokenizer::canStartIdentifier() const {
+  assert(!streamEof());
+  // <cctype> functions are undefined for negative values, so go via unsigned.
+  const unsigned char CurChar = Data[Pos];
+  return std::isalpha(CurChar) || CurChar == '_';
+}
+
+bool Tokenizer::canContinueIdentifier() const {
+  assert(!streamEof());
+  const unsigned char CurChar = Data[Pos]; // Avoid negative <cctype> input.
+  return std::isalnum(CurChar) || CurChar == '_';
+}
+
+bool Tokenizer::canStartInt() const {
+  assert(!streamEof()); // The cast below avoids negative <cctype> input.
+  return std::isdigit(static_cast<unsigned char>(Data[Pos]));
+}
+
+bool Tokenizer::canContinueInt() const {
+  assert(!streamEof()); // The cast below avoids negative <cctype> input.
+  return std::isalnum(static_cast<unsigned char>(Data[Pos]));
+}
+
+bool Tokenizer::canStartString() const {
+ return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
+}
+
+bool Tokenizer::streamEof() const { return Pos == DataLength; }
+
+Kind Tokenizer::classifyCurrentToken() const {
+ if (canStartInt())
+ return Kind::Int;
+ if (canStartString())
+ return Kind::String;
+ // BEGIN and END are at this point of lexing recognized as identifiers.
+ if (canStartIdentifier())
+ return Kind::Identifier;
+
+ const char CurChar = Data[Pos];
+
+ switch (CurChar) {
+ // One-character token classification.
+#define TOKEN(Name)
+#define SHORT_TOKEN(Name, Ch) \
+ case Ch: \
+ return Kind::Name;
+#include "ResourceScriptTokenList.h"
+#undef TOKEN
+#undef SHORT_TOKEN
+
+ default:
+ return Kind::Invalid;
+ }
+}
+
+void Tokenizer::processIdentifier(RCToken &Token) const {
+ assert(Token.kind() == Kind::Identifier);
+ StringRef Name = Token.value();
+
+ if (Name.equals_lower("begin"))
+ Token = RCToken(Kind::BlockBegin, Name);
+ else if (Name.equals_lower("end"))
+ Token = RCToken(Kind::BlockEnd, Name);
+}
+
+} // anonymous namespace
+
+namespace llvm {
+
+Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
+ return Tokenizer(Input).run();
+}
+
+} // namespace llvm
Added: llvm/trunk/tools/llvm-rc/ResourceScriptToken.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/ResourceScriptToken.h?rev=310621&view=auto
==============================================================================
--- llvm/trunk/tools/llvm-rc/ResourceScriptToken.h (added)
+++ llvm/trunk/tools/llvm-rc/ResourceScriptToken.h Thu Aug 10 09:21:44 2017
@@ -0,0 +1,81 @@
+//===-- ResourceScriptToken.h -----------------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This declares the .rc script tokens and defines an interface for tokenizing
+// the input data. The list of available tokens is located at
+// ResourceScriptTokenList.h.
+//
+// Note that the tokenizer does not support comments or preprocessor
+// directives. The preprocessor should do its work on the .rc file before
+// running llvm-rc.
+//
+// As for now, it is possible to parse ASCII files only (the behavior on
+// UTF files might be undefined). However, it already consumes UTF-8 BOM, if
+// there is any. Thus, ASCII-compatible UTF-8 files are tokenized correctly.
+//
+// Ref: msdn.microsoft.com/en-us/library/windows/desktop/aa380599(v=vs.85).aspx
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
+#define LLVM_TOOLS_LLVMRC_RESOURCESCRIPTTOKEN_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+// A definition of a single resource script token. Each token has its kind
+// (declared in ResourceScriptTokenList) and holds a value - a reference
+// representation of the token.
+// RCToken does not claim ownership on its value. A memory buffer containing
+// the token value should be stored in a safe place and cannot be freed
+// nor reallocated.
+class RCToken {
+public:
+ enum class Kind {
+#define TOKEN(Name) Name,
+#define SHORT_TOKEN(Name, Ch) Name,
+#include "ResourceScriptTokenList.h"
+#undef TOKEN
+#undef SHORT_TOKEN
+ };
+
+ RCToken(RCToken::Kind RCTokenKind, StringRef Value);
+
+ // Get an integer value of the integer token.
+ uint32_t intValue() const;
+
+ StringRef value() const;
+ Kind kind() const;
+
+private:
+ Kind TokenKind;
+ StringRef TokenValue;
+};
+
+// Tokenize Input.
+// In case no error occurred, the return value contains
+// tokens in order they were in the input file.
+// In case of any error, the return value contains
+// a textual representation of error.
+//
+// Tokens returned by this function hold only references to the parts
+// of the Input. Memory buffer containing Input cannot be freed,
+// modified or reallocated.
+Expected<std::vector<RCToken>> tokenizeRC(StringRef Input);
+
+} // namespace llvm
+
+#endif
Added: llvm/trunk/tools/llvm-rc/ResourceScriptTokenList.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/ResourceScriptTokenList.h?rev=310621&view=auto
==============================================================================
--- llvm/trunk/tools/llvm-rc/ResourceScriptTokenList.h (added)
+++ llvm/trunk/tools/llvm-rc/ResourceScriptTokenList.h Thu Aug 10 09:21:44 2017
@@ -0,0 +1,35 @@
+//===-- ResourceScriptTokenList.h -------------------------------*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This is a part of llvm-rc tokenizer. It lists all the possible tokens
+// that might occur in a correct .rc script.
+//
+//===---------------------------------------------------------------------===//
+
+
+// Long tokens. They might consist of more than one character.
+TOKEN(Invalid) // Invalid token. Should not occur in a valid script.
+TOKEN(Int) // Integer (decimal, octal or hexadecimal).
+TOKEN(String) // String value.
+TOKEN(Identifier) // Script identifier (resource name or type).
+
+// Short tokens. They usually consist of exactly one character.
+// The definitions are of the form SHORT_TOKEN(TokenName, TokenChar).
+// TokenChar is the one-character token representation occurring in the correct
+// .rc scripts.
+SHORT_TOKEN(BlockBegin, '{') // Start of the script block; can also be BEGIN.
+SHORT_TOKEN(BlockEnd, '}') // End of the block; can also be END.
+SHORT_TOKEN(Comma, ',') // Comma - resource arguments separator.
+SHORT_TOKEN(Plus, '+') // Addition operator.
+SHORT_TOKEN(Minus, '-') // Subtraction operator.
+SHORT_TOKEN(Pipe, '|') // Bitwise-OR operator.
+SHORT_TOKEN(Amp, '&') // Bitwise-AND operator.
+SHORT_TOKEN(Tilde, '~') // Bitwise-NOT operator.
+SHORT_TOKEN(LeftParen, '(') // Left parenthesis in the script expressions.
+SHORT_TOKEN(RightParen, ')') // Right parenthesis.
Modified: llvm/trunk/tools/llvm-rc/llvm-rc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/llvm-rc.cpp?rev=310621&r1=310620&r2=310621&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/llvm-rc.cpp (original)
+++ llvm/trunk/tools/llvm-rc/llvm-rc.cpp Thu Aug 10 09:21:44 2017
@@ -1,4 +1,4 @@
-//===- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*--===//
+//===-- llvm-rc.cpp - Compile .rc scripts into .res -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,6 +12,8 @@
//
//===----------------------------------------------------------------------===//
+#include "ResourceScriptToken.h"
+
#include "llvm/Option/Arg.h"
#include "llvm/Option/ArgList.h"
#include "llvm/Support/Error.h"
@@ -60,6 +62,12 @@ public:
};
static ExitOnError ExitOnErr;
+
+LLVM_ATTRIBUTE_NORETURN static void fatalError(const Twine &Message) {
+  errs() << Message << "\n";
+  exit(1); // Report failure through a non-zero exit status.
+}
+
} // anonymous namespace
int main(int argc_, const char *argv_[]) {
@@ -81,8 +89,49 @@ int main(int argc_, const char *argv_[])
opt::InputArgList InputArgs = T.ParseArgs(ArgsArr, MAI, MAC);
// The tool prints nothing when invoked with no command-line arguments.
- if (InputArgs.hasArg(OPT_HELP))
+ if (InputArgs.hasArg(OPT_HELP)) {
T.PrintHelp(outs(), "rc", "Resource Converter", false);
+ return 0;
+ }
+
+ const bool BeVerbose = InputArgs.hasArg(OPT_VERBOSE);
+
+ std::vector<std::string> InArgsInfo = InputArgs.getAllArgValues(OPT_INPUT);
+ if (InArgsInfo.size() != 1) {
+ fatalError("Exactly one input file should be provided.");
+ }
+
+ // Read and tokenize the input file.
+ const Twine &Filename = InArgsInfo[0];
+ ErrorOr<std::unique_ptr<MemoryBuffer>> File = MemoryBuffer::getFile(Filename);
+ if (!File) {
+ fatalError("Error opening file '" + Filename +
+ "': " + File.getError().message());
+ }
+
+ std::unique_ptr<MemoryBuffer> FileContents = std::move(*File);
+ StringRef Contents = FileContents->getBuffer();
+
+ std::vector<RCToken> Tokens = ExitOnErr(tokenizeRC(Contents));
+
+ if (BeVerbose) {
+ const Twine TokenNames[] = {
+#define TOKEN(Name) #Name,
+#define SHORT_TOKEN(Name, Ch) #Name,
+#include "ResourceScriptTokenList.h"
+#undef TOKEN
+#undef SHORT_TOKEN
+ };
+
+ for (const RCToken &Token : Tokens) {
+ outs() << TokenNames[static_cast<int>(Token.kind())] << ": "
+ << Token.value();
+ if (Token.kind() == RCToken::Kind::Int)
+ outs() << "; int value = " << Token.intValue();
+
+ outs() << "\n";
+ }
+ }
return 0;
}
More information about the llvm-commits
mailing list