[PATCH] (Part 1/2) non-Unicode response file on Windows: UTF-8 BOM
Yunzhong Gao
Yunzhong_Gao at playstation.sony.com
Fri Jan 23 15:44:39 PST 2015
Hi all,
This is spun off from D7133, based on Rafael's suggestion that the UTF-8 BOM changes should be in their own separate patch.
In the process of writing a regression test case, I looked at what was done for the UTF-16 BOM and tried to add a similar unit test for the UTF-8 BOM in the same file, but this also means that hasUTF8ByteOrderMark() needs to be exposed as an external function instead of a static helper function. I hope that is okay.
- Gao
http://reviews.llvm.org/D7156
Files:
llvm/include/llvm/Support/ConvertUTF.h
llvm/lib/Support/CommandLine.cpp
llvm/lib/Support/ConvertUTFWrapper.cpp
llvm/test/Other/ResponseFile.ll
llvm/unittests/Support/ConvertUTFTest.cpp
Index: llvm/include/llvm/Support/ConvertUTF.h
===================================================================
--- llvm/include/llvm/Support/ConvertUTF.h
+++ llvm/include/llvm/Support/ConvertUTF.h
@@ -243,6 +243,13 @@
bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
/**
+ * Returns true if a blob of text starts with a UTF-8 byte order mark.
+ * UTF-8 BOM is a sequence of bytes on Windows and is not affected by the host
+ * system's endianness.
+ */
+bool hasUTF8ByteOrderMark(ArrayRef<char> SrcBytes);
+
+/**
* Converts a stream of raw bytes assumed to be UTF16 into a UTF8 std::string.
*
* \param [in] SrcBytes A buffer of what is assumed to be UTF-16 encoded text.
Index: llvm/lib/Support/CommandLine.cpp
===================================================================
--- llvm/lib/Support/CommandLine.cpp
+++ llvm/lib/Support/CommandLine.cpp
@@ -674,6 +674,11 @@
return false;
Str = StringRef(UTF8Buf);
}
+ // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove
+ // these bytes before parsing.
+ // Reference: http://en.wikipedia.org/wiki/UTF-8#Byte_order_mark
+ else if (hasUTF8ByteOrderMark(BufRef))
+ Str = StringRef(BufRef.data() + 3, BufRef.size() - 3);
// Tokenize the contents into NewArgv.
Tokenizer(Str, Saver, NewArgv, MarkEOLs);
Index: llvm/lib/Support/ConvertUTFWrapper.cpp
===================================================================
--- llvm/lib/Support/ConvertUTFWrapper.cpp
+++ llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -81,6 +81,13 @@
(S[0] == '\xfe' && S[1] == '\xff')));
}
+// It is called byte order marker but the UTF-8 BOM is actually not affected
+// by the host system's endianness.
+bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
+ return (S.size() >= 3 &&
+ S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
+}
+
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
assert(Out.empty());
Index: llvm/test/Other/ResponseFile.ll
===================================================================
--- llvm/test/Other/ResponseFile.ll
+++ llvm/test/Other/ResponseFile.ll
@@ -6,6 +6,13 @@
; RUN: llvm-as @%t.list2 -o %t.bc
; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+; When the response file begins with UTF8 BOM sequence, we shall remove them.
+; RUN: echo -e "\xef\xbb\xbf" > %t.list3
+; RUN: echo %s >> %t.list3
+; RUN: echo -e "\xef\xbb\xbf-time-passes @%t.list3" > %t.list4
+; RUN: llvm-as @%t.list4 -o %t.bc
+; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+
; CHECK: T foobar
define void @foobar() {
Index: llvm/unittests/Support/ConvertUTFTest.cpp
===================================================================
--- llvm/unittests/Support/ConvertUTFTest.cpp
+++ llvm/unittests/Support/ConvertUTFTest.cpp
@@ -66,6 +66,20 @@
EXPECT_FALSE(HasBOM);
}
+TEST(ConvertUTFTest, HasUTF8BOM) {
+ bool HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf", 3));
+ EXPECT_TRUE(HasBOM);
+ HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf ", 4));
+ EXPECT_TRUE(HasBOM);
+ HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef\xbb\xbf\x00asdf", 7));
+ EXPECT_TRUE(HasBOM);
+
+ HasBOM = hasUTF8ByteOrderMark(None);
+ EXPECT_FALSE(HasBOM);
+ HasBOM = hasUTF8ByteOrderMark(makeArrayRef("\xef", 1));
+ EXPECT_FALSE(HasBOM);
+}
+
struct ConvertUTFResultContainer {
ConversionResult ErrorCode;
std::vector<unsigned> UnicodeScalars;
EMAIL PREFERENCES
http://reviews.llvm.org/settings/panel/emailpreferences/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D7156.18696.patch
Type: text/x-patch
Size: 3413 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20150123/8fc5f6da/attachment.bin>
More information about the llvm-commits
mailing list