[PATCH] [Driver] Convert response files with UTF-16 BOMs to UTF-8

Aaron Ballman aaron at aaronballman.com
Mon Jul 15 05:27:01 PDT 2013


On Mon, Jul 15, 2013 at 8:12 AM, Reid Kleckner <rnk at google.com> wrote:
>     - Check for even byte length utf16 files and don't stack allocate the conversion buffer.
>
> http://llvm-reviews.chandlerc.com/D1137
>
> CHANGE SINCE LAST DIFF
>   http://llvm-reviews.chandlerc.com/D1137?vs=2792&id=2797#toc
>
> Files:
>   test/Driver/at_file.c
>   test/Driver/at_file.c.args.utf16le
>   tools/driver/driver.cpp
>
> Index: test/Driver/at_file.c
> ===================================================================
> --- test/Driver/at_file.c
> +++ test/Driver/at_file.c
> @@ -1,5 +1,7 @@
>  // RUN: %clang -E %s @%s.args -o %t.log
>  // RUN: FileCheck --input-file=%t.log %s
> +// RUN: %clang -E %s @%s.args.utf16le -o %t.log
> +// RUN: FileCheck --input-file=%t.log %s
>
>  // CHECK: bar1
>  // CHECK-NEXT: bar2 zed2
> Index: tools/driver/driver.cpp
> ===================================================================
> --- tools/driver/driver.cpp
> +++ tools/driver/driver.cpp
> @@ -29,6 +29,7 @@
>  #include "llvm/Option/ArgList.h"
>  #include "llvm/Option/OptTable.h"
>  #include "llvm/Option/Option.h"
> +#include "llvm/Support/ConvertUTF.h"
>  #include "llvm/Support/ErrorHandling.h"
>  #include "llvm/Support/FileSystem.h"
>  #include "llvm/Support/Host.h"
> @@ -200,16 +201,63 @@
>    }
>
>    const char *Buf = MemBuf->getBufferStart();
> +  const char *BufEnd = MemBuf->getBufferEnd();
>    char InQuote = ' ';
>    std::string CurArg;
>
> +  // Move this to ConvertUTF.h if needed elsewhere.
> +  enum {
> +    UTF16BOMNative = 0xFEFF,
> +    UTF16BOMByteSwapped = 0xFFFE,
> +  };
> +
> +  UTF16 MaybeBOM = 0;
> +  memcpy(&MaybeBOM, Buf, 2);

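Is it possible for the buffer to contain only a single byte (or to be empty)? If so, this unconditional 2-byte memcpy would read past the end of the file's contents, so it seems worth checking the buffer size first.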

> +  const UTF16 *Src = 0, *SrcEnd = 0;
> +  std::vector<UTF8> UTF8Buf;
> +
> +  if (MaybeBOM == UTF16BOMByteSwapped ||
> +      (MaybeBOM == UTF16BOMNative && MemBuf->getBufferSize() % 2)) {
> +    // This is a byte-swapped UTF16 BOM, or a native BOM with an uneven file
> +    // length.  We can't issue a diagnostic, so pretend we didn't realize this
> +    // was a response file.
> +    ArgVector.push_back(SaveStringInSet(SavedStrings, Arg));
> +    return;
> +  }
> +
> +  if (MaybeBOM == UTF16BOMNative) {
> +    // Native endianness UTF16 BOM.  Convert a chunk at a time as needed.
> +    Src = reinterpret_cast<const UTF16 *>(Buf) + 1;
> +    SrcEnd = reinterpret_cast<const UTF16 *>(BufEnd);
> +    Buf = 0;
> +    BufEnd = 0;
> +    UTF8Buf.resize(4096);
> +  }
> +
>    for (const char *P = Buf; ; ++P) {
> +    if (Src && P == BufEnd) {
> +      // Convert another chunk of UTF16 to UTF8.
> +      UTF8 *Dst = &UTF8Buf[0];
> +      UTF8 *DstEnd = &UTF8Buf[UTF8Buf.size() - 1] + 1;
> +      ConversionResult CR =
> +          ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
> +      if (CR == sourceExhausted)
> +        break;
> +      if (CR != conversionOK && CR != targetExhausted)
> +        return;  // FIXME: Fail more loudly.
> +      // Dst was updated to be one past the last translated byte.
> +      P = reinterpret_cast<char *>(&UTF8Buf[0]);
> +      BufEnd = reinterpret_cast<char *>(Dst);
> +    }
> +
>      if (*P == '\0' || (isWhitespace(*P) && InQuote == ' ')) {
>        if (!CurArg.empty()) {
>
>          if (CurArg[0] != '@') {
>            ArgVector.push_back(SaveStringInSet(SavedStrings, CurArg));
>          } else {
> +          // FIXME: A response file can refer to itself and cause infinite
> +          // recursion.
>            ExpandArgsFromBuf(CurArg.c_str(), ArgVector, SavedStrings);
>          }
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
>



More information about the cfe-commits mailing list