[PATCH] [Driver] Convert response files with UTF-16 BOMs to UTF-8

Reid Kleckner rnk at google.com
Mon Jul 15 05:12:12 PDT 2013


    - Check that UTF-16 response files have an even byte length, and don't stack-allocate the conversion buffer.

http://llvm-reviews.chandlerc.com/D1137

CHANGE SINCE LAST DIFF
  http://llvm-reviews.chandlerc.com/D1137?vs=2792&id=2797#toc

Files:
  test/Driver/at_file.c
  test/Driver/at_file.c.args.utf16le
  tools/driver/driver.cpp

Index: test/Driver/at_file.c
===================================================================
--- test/Driver/at_file.c
+++ test/Driver/at_file.c
@@ -1,5 +1,7 @@
 // RUN: %clang -E %s @%s.args -o %t.log
 // RUN: FileCheck --input-file=%t.log %s
+// RUN: %clang -E %s @%s.args.utf16le -o %t.log
+// RUN: FileCheck --input-file=%t.log %s
 
 // CHECK: bar1
 // CHECK-NEXT: bar2 zed2
Index: tools/driver/driver.cpp
===================================================================
--- tools/driver/driver.cpp
+++ tools/driver/driver.cpp
@@ -29,6 +29,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
 #include "llvm/Option/Option.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Host.h"
@@ -200,16 +201,63 @@
   }
 
   const char *Buf = MemBuf->getBufferStart();
+  const char *BufEnd = MemBuf->getBufferEnd();
   char InQuote = ' ';
   std::string CurArg;
 
+  // Move this to ConvertUTF.h if needed elsewhere.
+  enum {
+    UTF16BOMNative = 0xFEFF,
+    UTF16BOMByteSwapped = 0xFFFE,
+  };
+
+  UTF16 MaybeBOM = 0;
+  memcpy(&MaybeBOM, Buf, 2);
+  const UTF16 *Src = 0, *SrcEnd = 0;
+  std::vector<UTF8> UTF8Buf;
+
+  if (MaybeBOM == UTF16BOMByteSwapped ||
+      (MaybeBOM == UTF16BOMNative && MemBuf->getBufferSize() % 2)) {
+    // This is a byte-swapped UTF16 BOM, or a native BOM with an uneven file
+    // length.  We can't issue a diagnostic, so pretend we didn't realize this
+    // was a response file.
+    ArgVector.push_back(SaveStringInSet(SavedStrings, Arg));
+    return;
+  }
+
+  if (MaybeBOM == UTF16BOMNative) {
+    // Native endianness UTF16 BOM.  Convert a chunk at a time as needed.
+    Src = reinterpret_cast<const UTF16 *>(Buf) + 1;
+    SrcEnd = reinterpret_cast<const UTF16 *>(BufEnd);
+    Buf = 0;
+    BufEnd = 0;
+    UTF8Buf.resize(4096);
+  }
+
   for (const char *P = Buf; ; ++P) {
+    if (Src && P == BufEnd) {
+      // Convert another chunk of UTF16 to UTF8.
+      UTF8 *Dst = &UTF8Buf[0];
+      UTF8 *DstEnd = &UTF8Buf[UTF8Buf.size() - 1] + 1;
+      ConversionResult CR =
+          ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+      if (CR == sourceExhausted)
+        break;
+      if (CR != conversionOK && CR != targetExhausted)
+        return;  // FIXME: Fail more loudly.
+      // Dst was updated to be one past the last translated byte.
+      P = reinterpret_cast<char *>(&UTF8Buf[0]);
+      BufEnd = reinterpret_cast<char *>(Dst);
+    }
+
     if (*P == '\0' || (isWhitespace(*P) && InQuote == ' ')) {
       if (!CurArg.empty()) {
 
         if (CurArg[0] != '@') {
           ArgVector.push_back(SaveStringInSet(SavedStrings, CurArg));
         } else {
+          // FIXME: A response file can refer to itself and cause infinite
+          // recursion.
           ExpandArgsFromBuf(CurArg.c_str(), ArgVector, SavedStrings);
         }
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1137.2.patch
Type: text/x-patch
Size: 2944 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20130715/4a274273/attachment.bin>


More information about the cfe-commits mailing list