[PATCH] [Driver] Convert response files with UTF-16 BOMs to UTF-8
Nico Weber
thakis at chromium.org
Sat Jul 13 21:21:09 PDT 2013
On Fri, Jul 12, 2013 at 5:28 PM, Reid Kleckner <rnk at google.com> wrote:
> MSBuild writes response files as UTF-16 little endian with a byte order
> mark (0xFEFF). With this change, clang will be able to read them, although we
> still can't parse any of their flags. BOMs with non-native endianness are
> recognized as 0xFFFE. Byteswapping them is TODO.
>
> http://llvm-reviews.chandlerc.com/D1137
>
> Files:
> test/Driver/at_file.c
> test/Driver/at_file.c.args.utf16le
> tools/driver/driver.cpp
>
> Index: test/Driver/at_file.c
> ===================================================================
> --- test/Driver/at_file.c
> +++ test/Driver/at_file.c
> @@ -1,5 +1,7 @@
> // RUN: %clang -E %s @%s.args -o %t.log
> // RUN: FileCheck --input-file=%t.log %s
> +// RUN: %clang -E %s @%s.args.utf16le -o %t.log
> +// RUN: FileCheck --input-file=%t.log %s
>
> // CHECK: bar1
> // CHECK-NEXT: bar2 zed2
> Index: tools/driver/driver.cpp
> ===================================================================
> --- tools/driver/driver.cpp
> +++ tools/driver/driver.cpp
> @@ -25,9 +25,11 @@
> #include "llvm/ADT/OwningPtr.h"
> #include "llvm/ADT/SmallString.h"
> #include "llvm/ADT/SmallVector.h"
> +#include "llvm/ADT/STLExtras.h"
> #include "llvm/Option/ArgList.h"
> #include "llvm/Option/OptTable.h"
> #include "llvm/Option/Option.h"
> +#include "llvm/Support/ConvertUTF.h"
> #include "llvm/Support/ErrorHandling.h"
> #include "llvm/Support/FileSystem.h"
> #include "llvm/Support/Host.h"
> @@ -199,16 +201,51 @@
> }
>
> const char *Buf = MemBuf->getBufferStart();
> + const char *BufEnd = MemBuf->getBufferEnd() + 1;
>
> char InQuote = ' ';
> std::string CurArg;
>
> + UTF16 MaybeBOM = 0;
> + memcpy(&MaybeBOM, Buf, 2);
> + const UTF16 *Src = 0, *SrcEnd = 0;
> + UTF8 UTF8Buf[4096];
> +
> + if (MaybeBOM == 0xFFFE) {
> + // Byte-swapped endianness UTF16 BOM. We can't issue a diagnostic, so
> + // pretend we didn't realize this was a response file.
> + ArgVector.push_back(SaveStringInSet(SavedStrings, Arg));
> + return;
> + } else if (MaybeBOM == 0xFEFF) {
> + // Native endianness UTF16 BOM. Convert a chunk at a time as needed.
> + Src = reinterpret_cast<const UTF16 *>(Buf) + 1;
> + SrcEnd = reinterpret_cast<const UTF16 *>(BufEnd);
>
Is BufEnd guaranteed to be aligned on a UTF16 boundary here? It is computed
as MemBuf->getBufferEnd() + 1 and then reinterpret_cast to const UTF16 *.
> + Buf = 0;
> + BufEnd = 0;
> + }
> +
> for (const char *P = Buf; ; ++P) {
> + if (P == BufEnd) {
> + assert(Src);
> + UTF8 *Dst = &UTF8Buf[0];
> + ConversionResult CR = ConvertUTF16toUTF8(
> + &Src, SrcEnd, &Dst, llvm::array_endof(UTF8Buf), strictConversion);
> + if (CR == sourceExhausted)
> + break;
> + if (CR != conversionOK && CR != targetExhausted)
> + return; // FIXME: Fail more loudly.
> + // Dst was updated to be one past the last translated byte.
> + P = reinterpret_cast<char *>(UTF8Buf);
> + BufEnd = reinterpret_cast<char *>(Dst);
> + }
> +
> if (*P == '\0' || (isWhitespace(*P) && InQuote == ' ')) {
> if (!CurArg.empty()) {
>
> if (CurArg[0] != '@') {
> ArgVector.push_back(SaveStringInSet(SavedStrings, CurArg));
> } else {
> + // FIXME: A response file can refer to itself and cause infinite
> + // recursion.
> ExpandArgsFromBuf(CurArg.c_str(), ArgVector, SavedStrings);
> }
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-commits/attachments/20130713/f9f81062/attachment.html>
More information about the cfe-commits
mailing list