[PATCH] (Part 1/2) non-Unicode response file on Windows: UTF-8 BOM

Fri Jan 23 15:44:39 PST 2015

Hi all,
This is spun off from D7133, based on Rafael's suggestion that the UTF-8 BOM changes should be in their own separate patch.
In the process of writing a regression test case, I looked at what was done for the UTF-16 BOM and tried to add a similar unit test for the UTF-8 BOM in the same file. This means that hasUTF8ByteOrderMark() needs to be exposed as an external function instead of a static helper function. I hope that is okay.
- Gao

http://reviews.llvm.org/D7156

Files:
  llvm/include/llvm/Support/ConvertUTF.h
  llvm/lib/Support/CommandLine.cpp
  llvm/lib/Support/ConvertUTFWrapper.cpp
  llvm/test/Other/ResponseFile.ll
  llvm/unittests/Support/ConvertUTFTest.cpp

Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================

--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -243,6 +243,13 @@
 bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
 
 /**
+ * Returns true if a blob of text starts with a UTF-8 byte order mark.
+ * UTF-8 BOM is a sequence of bytes on Windows and is not affected by the host
+ * system's endianness.
+ */
+bool hasUTF8ByteOrderMark(ArrayRef<char> SrcBytes);
+
+/**
  * Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
  *
  * \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
Index: llvm/lib/Support/CommandLine.cpp
===================================================================
--- llvm/lib/Support/CommandLine.cpp
+++ llvm/lib/Support/CommandLine.cpp
@@ -674,6 +674,11 @@
       return false;
     Str = StringRef(UTF8Buf);
   }
+  // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove
+  // these bytes before parsing.
+  // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+  else if (hasUTF8ByteOrderMark(BufRef))
+    Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
 
   // Tokenize the contents into NewArgv.
   Tokenizer(Str, Saver, NewArgv, MarkEOLs);
Index: llvm/lib/Support/ConvertUTFWrapper.cpp
===================================================================
--- llvm/lib/Support/ConvertUTFWrapper.cpp
+++ llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -81,6 +81,13 @@
            (S[0] == '\xfe' && S[1] == '\xff')));
 }
 
+// It is called byte order marker but the UTF-8 BOM is actually not affected
+// by the host system's endianness.
+bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 3 &&
+          S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   assert(Out.empty());
 
Index: llvm/test/Other/ResponseFile.ll
===================================================================
--- llvm/test/Other/ResponseFile.ll
+++ llvm/test/Other/ResponseFile.ll
@@ -6,6 +6,13 @@
 ; RUN: llvm-as @%t.list2 -o %t.bc
 ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
 
+; When the response file begins with UTF8 BOM sequence, we shall remove them.
+; RUN: echo -e "\xef\xbb\xbf" > %t.list3
+; RUN: echo %s >> %t.list3
+; RUN: echo -e "\xef\xbb\xbf-time-passes @%t.list3" > %t.list4
+; RUN: llvm-as @%t.list4 -o %t.bc
+; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+
 ; CHECK: T foobar
 
 define void @foobar() {
Index: llvm/unittests/Support/ConvertUTFTest.cpp
===================================================================
--- llvm/unittests/Support/ConvertUTFTest.cpp
+++ llvm/unittests/Support/ConvertUTFTest.cpp
@@ -66,6 +66,20 @@
   EXPECT_FALSE(HasBOM);
 }
 
+TEST(ConvertUTFTest, HasUTF8BOM) {
+  bool HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf", 3));
+  EXPECT_TRUE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf ", 4));
+  EXPECT_TRUE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf\x00asdf", 7));
+  EXPECT_TRUE(HasBOM);
+
+  HasBOM = hasUTF8ByteOrderMark(None);
+  EXPECT_FALSE(HasBOM);
+  HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef", 1));
+  EXPECT_FALSE(HasBOM);
+}
+
 struct ConvertUTFResultContainer {
   ConversionResult ErrorCode;
   std::vector<unsigned> UnicodeScalars;

EMAIL PREFERENCES
  http://reviews.llvm.org/settings/panel/emailpreferences/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D7156.18696.patch
Type: text/x-patch
Size: 3413 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150123/8fc5f6da/attachment.bin>