[llvm] r331391 - [llvm-rc] Add rudimentary support for codepages

Martin Storsjo via llvm-commits llvm-commits at lists.llvm.org
Wed May 2 12:43:44 PDT 2018


Author: mstorsjo
Date: Wed May  2 12:43:44 2018
New Revision: 331391

URL: http://llvm.org/viewvc/llvm-project?rev=331391&view=rev
Log:
[llvm-rc] Add rudimentary support for codepages

Only support UTF-8 (since LLVM contains UTF-8 parsing support
already, and the code even does that already) and Windows-1252
(where most code points have the same value in Unicode). Keep the
existing default as only allowing ASCII input.

Use the option type JoinedOrSeparate, since the real rc.exe
handles options in this form, even if llvm-rc uses Separate for
other similar existing options.

Rename the struct SearchParams to WriterParams since it's now used
for more than just include paths.

Add a missing getResourceTypeName method to the BundleResource class,
to fix error printing from within STRINGTABLE resources (used in
tests).

Differential Revision: https://reviews.llvm.org/D46238

Added:
    llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc
    llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc
    llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc
    llvm/trunk/test/tools/llvm-rc/codepage.test
Modified:
    llvm/trunk/test/tools/llvm-rc/helpmsg.test
    llvm/trunk/tools/llvm-rc/Opts.td
    llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp
    llvm/trunk/tools/llvm-rc/ResourceFileWriter.h
    llvm/trunk/tools/llvm-rc/llvm-rc.cpp

Added: llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc?rev=331391&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc (added)
+++ llvm/trunk/test/tools/llvm-rc/Inputs/cp1252.rc Wed May  2 12:43:44 2018
@@ -0,0 +1,4 @@
+STRINGTABLE {
+  1 "åäö © ƒ \xe5\xe4\366 \251 \x83"
+  2 L"åäö © ƒ \xe5\xe4\366 \251 \x0192"
+}

Added: llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc?rev=331391&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc (added)
+++ llvm/trunk/test/tools/llvm-rc/Inputs/utf8-escape-narrow.rc Wed May  2 12:43:44 2018
@@ -0,0 +1,5 @@
+STRINGTABLE {
+  // One can't pass UTF-8 sequences via multiple escaped chars - in narrow
+  // strings in UTF-8 mode, only ASCII chars can be entered via escapes.
+  1 "åäö \xc3\xa5"
+}

Added: llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc?rev=331391&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc (added)
+++ llvm/trunk/test/tools/llvm-rc/Inputs/utf8.rc Wed May  2 12:43:44 2018
@@ -0,0 +1,6 @@
+STRINGTABLE {
+  // One can't pass UTF-8 sequences via multiple escaped chars - in narrow
+  // strings in UTF-8 mode, only ASCII chars can be entered via escapes.
+  1 "åäö © \x61"
+  2 L"åäö © \xe5\xe4\366 \251"
+}

Added: llvm/trunk/test/tools/llvm-rc/codepage.test
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/codepage.test?rev=331391&view=auto
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/codepage.test (added)
+++ llvm/trunk/test/tools/llvm-rc/codepage.test Wed May  2 12:43:44 2018
@@ -0,0 +1,44 @@
+; RUN: llvm-rc /C 65001 /FO %t.utf8.res %p/Inputs/utf8.rc
+; RUN: llvm-readobj %t.utf8.res | FileCheck %s --check-prefix=UTF8
+
+; UTF8:      Resource type (int): 6
+; UTF8-NEXT: Resource name (int): 1
+; UTF8-NEXT: Data version: 0
+; UTF8-NEXT: Memory flags: 0x1030
+; UTF8-NEXT: Language ID: 1033
+; UTF8-NEXT: Version (major): 0
+; UTF8-NEXT: Version (minor): 0
+; UTF8-NEXT: Characteristics: 0
+; UTF8-NEXT: Data size: 68
+; UTF8-NEXT: Data: (
+; UTF8-NEXT:   0000: 00000700 E500E400 F6002000 A9002000  |.......... ... .|
+; UTF8-NEXT:   0010: 61000B00 E500E400 F6002000 A9002000  |a......... ... .|
+; UTF8-NEXT:   0020: E500E400 F6002000 A9000000 00000000  |...... .........|
+; UTF8-NEXT:   0030: 00000000 00000000 00000000 00000000  |................|
+; UTF8-NEXT:   0040: 00000000                             |....|
+; UTF8-NEXT: )
+
+; RUN: not llvm-rc /C 65001 /FO %t.utf8-escape-narrow.res %p/Inputs/utf8-escape-narrow.rc 2>&1 | FileCheck %s --check-prefix UTF8_ESCAPE
+; UTF8_ESCAPE: llvm-rc: Error in STRINGTABLE statement (ID 1):
+; UTF8_ESCAPE-NEXT: Unable to interpret single byte (195) as UTF-8
+
+; RUN: llvm-rc /C 1252 /FO %t.cp1252.res %p/Inputs/cp1252.rc
+; RUN: llvm-readobj %t.cp1252.res | FileCheck %s --check-prefix=CP1252
+
+; CP1252:      Resource type (int): 6
+; CP1252-NEXT: Resource name (int): 1
+; CP1252-NEXT: Data version: 0
+; CP1252-NEXT: Memory flags: 0x1030
+; CP1252-NEXT: Language ID: 1033
+; CP1252-NEXT: Version (major): 0
+; CP1252-NEXT: Version (minor): 0
+; CP1252-NEXT: Characteristics: 0
+; CP1252-NEXT: Data size: 92
+; CP1252-NEXT: Data: (
+; CP1252-NEXT:   0000: 00000F00 E500E400 F6002000 A9002000  |.......... ... .|
+; CP1252-NEXT:   0010: 92012000 E500E400 F6002000 A9002000  |.. ....... ... .|
+; CP1252-NEXT:   0020: 92010F00 E500E400 F6002000 A9002000  |.......... ... .|
+; CP1252-NEXT:   0030: 92012000 E500E400 F6002000 A9002000  |.. ....... ... .|
+; CP1252-NEXT:   0040: 92010000 00000000 00000000 00000000  |................|
+; CP1252-NEXT:   0050: 00000000 00000000 00000000           |............|
+; CP1252-NEXT: )

Modified: llvm/trunk/test/tools/llvm-rc/helpmsg.test
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/tools/llvm-rc/helpmsg.test?rev=331391&r1=331390&r2=331391&view=diff
==============================================================================
--- llvm/trunk/test/tools/llvm-rc/helpmsg.test (original)
+++ llvm/trunk/test/tools/llvm-rc/helpmsg.test Wed May  2 12:43:44 2018
@@ -7,6 +7,7 @@
 ; CHECK-DAG:  USAGE: rc [options] <inputs>
 ; CHECK-DAG:  OPTIONS:
 ; CHECK-NEXT:    /?          Display this help and exit.
+; CHECK-NEXT:    /C <value>  Set the codepage used for input strings.
 ; CHECK-NEXT:    /dry-run    Don't compile the input; only try to parse it.
 ; CHECK-NEXT:    /D <value>  Define a symbol for the C preprocessor.
 ; CHECK-NEXT:    /FO <value> Change the output file location.

Modified: llvm/trunk/tools/llvm-rc/Opts.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/Opts.td?rev=331391&r1=331390&r2=331391&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/Opts.td (original)
+++ llvm/trunk/tools/llvm-rc/Opts.td Wed May  2 12:43:44 2018
@@ -35,6 +35,9 @@ def H : Flag<[ "/", "-" ], "H">,
 def DRY_RUN : Flag<[ "/", "-" ], "dry-run">,
               HelpText<"Don't compile the input; only try to parse it.">;
 
+def CODEPAGE : JoinedOrSeparate<[ "/", "-" ], "C">,
+               HelpText<"Set the codepage used for input strings.">;
+
 // Unused switches (at least for now). These will stay unimplemented
 // in an early stage of development and can be ignored. However, we need to
 // parse them in order to preserve the compatibility with the original tool.
@@ -44,7 +47,6 @@ def R : Flag<[ "/", "-" ], "R">;
 def SL : Flag<[ "/", "-" ], "SL">;
 
 // (Codepages support.)
-def C : Flag<[ "/", "-" ], "C">;
 def W : Flag<[ "/", "-" ], "W">;
 
 // (Support of MUI and similar.)

Modified: llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp?rev=331391&r1=331390&r2=331391&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp (original)
+++ llvm/trunk/tools/llvm-rc/ResourceFileWriter.cpp Wed May  2 12:43:44 2018
@@ -110,6 +110,18 @@ static bool stripQuotes(StringRef &Str,
   return true;
 }
 
+static UTF16 cp1252ToUnicode(unsigned char C) {
+  static const UTF16 Map80[] = {
+      0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
+      0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
+      0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
+      0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178,
+  };
+  if (C >= 0x80 && C <= 0x9F)
+    return Map80[C - 0x80];
+  return C;
+}
+
 // Describes a way to handle '\0' characters when processing the string.
 // rc.exe tool sometimes behaves in a weird way in postprocessing.
 // If the string to be output is equivalent to a C-string (e.g. in MENU
@@ -132,10 +144,26 @@ enum class NullHandlingMethod {
 //   * Replace the escape sequences with their processed version.
 // For identifiers, this is no-op.
 static Error processString(StringRef Str, NullHandlingMethod NullHandler,
-                           bool &IsLongString, SmallVectorImpl<UTF16> &Result) {
+                           bool &IsLongString, SmallVectorImpl<UTF16> &Result,
+                           int CodePage) {
   bool IsString = stripQuotes(Str, IsLongString);
   SmallVector<UTF16, 128> Chars;
-  convertUTF8ToUTF16String(Str, Chars);
+
+  // Convert the input bytes according to the chosen codepage.
+  if (CodePage == CpUtf8) {
+    convertUTF8ToUTF16String(Str, Chars);
+  } else if (CodePage == CpWin1252) {
+    for (char C : Str)
+      Chars.push_back(cp1252ToUnicode((unsigned char)C));
+  } else {
+    // For other, unknown codepages, only allow plain ASCII input.
+    for (char C : Str) {
+      if ((unsigned char)C > 0x7F)
+        return createError("Non-ASCII 8-bit codepoint (" + Twine(C) +
+                           ") can't be interpreted in the current codepage");
+      Chars.push_back((unsigned char)C);
+    }
+  }
 
   if (!IsString) {
     // It's an identifier if it's not a string. Make all characters uppercase.
@@ -157,21 +185,35 @@ static Error processString(StringRef Str
         if (Char > 0xFF)
           return createError("Non-8-bit codepoint (" + Twine(Char) +
                              ") can't occur in a user-defined narrow string");
+      }
+    }
 
+    Result.push_back(Char);
+    return Error::success();
+  };
+  auto AddEscapedChar = [AddRes, IsLongString, CodePage](UTF16 Char) -> Error {
+    if (!IsLongString) {
+      // Escaped chars in narrow strings have to be interpreted according to
+      // the chosen code page.
+      if (Char > 0xFF)
+        return createError("Non-8-bit escaped char (" + Twine(Char) +
+                           ") can't occur in narrow string");
+      if (CodePage == CpUtf8) {
+        if (Char >= 0x80)
+          return createError("Unable to interpret single byte (" + Twine(Char) +
+                             ") as UTF-8");
+      } else if (CodePage == CpWin1252) {
+        Char = cp1252ToUnicode(Char);
       } else {
-        // In case of narrow non-user strings, Windows RC converts
-        // [0x80, 0xFF] chars according to the current codepage.
-        // There is no 'codepage' concept settled in every supported platform,
-        // so we should reject such inputs.
-        if (Char > 0x7F && Char <= 0xFF)
+        // Unknown/unsupported codepage, only allow ASCII input.
+        if (Char > 0x7F)
           return createError("Non-ASCII 8-bit codepoint (" + Twine(Char) +
                              ") can't "
                              "occur in a non-Unicode string");
       }
     }
 
-    Result.push_back(Char);
-    return Error::success();
+    return AddRes(Char);
   };
 
   while (Pos < Chars.size()) {
@@ -223,7 +265,7 @@ static Error processString(StringRef Str
           --RemainingChars;
         }
 
-        RETURN_IF_ERROR(AddRes(ReadInt));
+        RETURN_IF_ERROR(AddEscapedChar(ReadInt));
         continue;
       }
 
@@ -240,7 +282,7 @@ static Error processString(StringRef Str
           ++Pos;
         }
 
-        RETURN_IF_ERROR(AddRes(ReadInt));
+        RETURN_IF_ERROR(AddEscapedChar(ReadInt));
 
         continue;
       }
@@ -328,7 +370,8 @@ Error ResourceFileWriter::writeCString(S
   SmallVector<UTF16, 128> ProcessedString;
   bool IsLongString;
   RETURN_IF_ERROR(processString(Str, NullHandlingMethod::CutAtNull,
-                                IsLongString, ProcessedString));
+                                IsLongString, ProcessedString,
+                                Params.CodePage));
   for (auto Ch : ProcessedString)
     writeInt<uint16_t>(Ch);
   if (WriteTerminator)
@@ -1142,6 +1185,7 @@ public:
   static bool classof(const RCResource *Res) {
     return Res->getKind() == RkStringTableBundle;
   }
+  Twine getResourceTypeName() const override { return "STRINGTABLE"; }
 };
 
 Error ResourceFileWriter::visitStringTableBundle(const RCResource *Res) {
@@ -1168,7 +1212,7 @@ Error ResourceFileWriter::writeStringTab
     SmallVector<UTF16, 128> Data;
     RETURN_IF_ERROR(processString(Res->Bundle.Data[ID].getValueOr(StringRef()),
                                   NullHandlingMethod::CutAtDoubleNull,
-                                  IsLongString, Data));
+                                  IsLongString, Data, Params.CodePage));
     if (AppendNull && Res->Bundle.Data[ID])
       Data.push_back('\0');
     RETURN_IF_ERROR(
@@ -1215,9 +1259,9 @@ Error ResourceFileWriter::writeUserDefin
 
     SmallVector<UTF16, 128> ProcessedString;
     bool IsLongString;
-    RETURN_IF_ERROR(processString(Elem.getString(),
-                                  NullHandlingMethod::UserResource,
-                                  IsLongString, ProcessedString));
+    RETURN_IF_ERROR(
+        processString(Elem.getString(), NullHandlingMethod::UserResource,
+                      IsLongString, ProcessedString, Params.CodePage));
 
     for (auto Ch : ProcessedString) {
       if (IsLongString) {

Modified: llvm/trunk/tools/llvm-rc/ResourceFileWriter.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/ResourceFileWriter.h?rev=331391&r1=331390&r2=331391&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/ResourceFileWriter.h (original)
+++ llvm/trunk/tools/llvm-rc/ResourceFileWriter.h Wed May  2 12:43:44 2018
@@ -25,15 +25,25 @@ class MemoryBuffer;
 
 namespace rc {
 
-struct SearchParams {
+enum CodePage {
+  CpAcp = 0,        // The currently active codepage. LLVM has no notion of
+                    // which codepage that actually is, so this value only
+                    // allows ASCII input.
+  CpWin1252 = 1252, // A codepage where most 8 bit values correspond to
+                    // Unicode code points with the same value.
+  CpUtf8 = 65001,   // UTF-8.
+};
+
+struct WriterParams {
   std::vector<std::string> Include;   // Additional folders to search for files.
   std::vector<std::string> NoInclude; // Folders to exclude from file search.
   StringRef InputFilePath;            // The full path of the input file.
+  int CodePage = CpAcp;               // The codepage for interpreting characters.
 };
 
 class ResourceFileWriter : public Visitor {
 public:
-  ResourceFileWriter(const SearchParams &Params,
+  ResourceFileWriter(const WriterParams &Params,
                      std::unique_ptr<raw_fd_ostream> Stream)
       : Params(Params), FS(std::move(Stream)), IconCursorID(1) {
     assert(FS && "Output stream needs to be provided to the serializator");
@@ -146,7 +156,7 @@ private:
   Error writeVersionInfoBlock(const VersionInfoBlock &);
   Error writeVersionInfoValue(const VersionInfoValue &);
 
-  const SearchParams &Params;
+  const WriterParams &Params;
 
   // Output stream handling.
   std::unique_ptr<raw_fd_ostream> FS;

Modified: llvm/trunk/tools/llvm-rc/llvm-rc.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-rc/llvm-rc.cpp?rev=331391&r1=331390&r2=331391&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-rc/llvm-rc.cpp (original)
+++ llvm/trunk/tools/llvm-rc/llvm-rc.cpp Wed May  2 12:43:44 2018
@@ -129,13 +129,29 @@ int main(int Argc, const char **Argv) {
     }
   }
 
-  SearchParams Params;
+  WriterParams Params;
   SmallString<128> InputFile(InArgsInfo[0]);
   llvm::sys::fs::make_absolute(InputFile);
   Params.InputFilePath = InputFile;
   Params.Include = InputArgs.getAllArgValues(OPT_INCLUDE);
   Params.NoInclude = InputArgs.getAllArgValues(OPT_NOINCLUDE);
 
+  if (InputArgs.hasArg(OPT_CODEPAGE)) {
+    if (InputArgs.getLastArgValue(OPT_CODEPAGE)
+            .getAsInteger(10, Params.CodePage))
+      fatalError("Invalid code page: " +
+                 InputArgs.getLastArgValue(OPT_CODEPAGE));
+    switch (Params.CodePage) {
+    case CpAcp:
+    case CpWin1252:
+    case CpUtf8:
+      break;
+    default:
+      fatalError(
+          "Unsupported code page, only 0, 1252 and 65001 are supported!");
+    }
+  }
+
   std::unique_ptr<ResourceFileWriter> Visitor;
   bool IsDryRun = InputArgs.hasArg(OPT_DRY_RUN);
 




More information about the llvm-commits mailing list