[llvm] r329468 - Windows needs the current codepage instead of utf8 sometimes
Aaron Smith via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 6 17:32:59 PDT 2018
Author: asmith
Date: Fri Apr 6 17:32:59 2018
New Revision: 329468
URL: http://llvm.org/viewvc/llvm-project?rev=329468&view=rev
Log:
Windows needs the current codepage instead of utf8 sometimes
llvm-mc (and other tools that use Path.inc on Windows) assume that strings are
UTF-8 encoded; however, this is not always the case. On Windows the default code
page is not UTF-8, so most of the time the strings are not UTF-8 encoded.
The lld test 'format-binary-non-ascii' uses llvm-mc with a file with non-ASCII
characters in the name, which is how this bug was found. The test fails when run
using Python 3 because it uses properly encoded Unicode strings (Python 2 actually
ends up using a byte string which is not UTF-8 encoded, so the test passes, but
that's a separate issue).
Patch by Stella Stamenova!
Modified:
llvm/trunk/lib/Support/Windows/Path.inc
llvm/trunk/lib/Support/Windows/WindowsSupport.h
Modified: llvm/trunk/lib/Support/Windows/Path.inc
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/Windows/Path.inc?rev=329468&r1=329467&r2=329468&view=diff
==============================================================================
--- llvm/trunk/lib/Support/Windows/Path.inc (original)
+++ llvm/trunk/lib/Support/Windows/Path.inc Fri Apr 6 17:32:59 2018
@@ -45,6 +45,7 @@ typedef int errno_t;
using namespace llvm;
using llvm::sys::windows::UTF8ToUTF16;
+using llvm::sys::windows::CurCPToUTF16;
using llvm::sys::windows::UTF16ToUTF8;
using llvm::sys::path::widenPath;
@@ -62,7 +63,7 @@ namespace llvm {
namespace sys {
namespace path {
-// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the
+// Convert a (likely) UTF-8 path to UTF-16. Also, if the absolute equivalent of the
// path is longer than CreateDirectory can tolerate, make it absolute and
// prefixed by '\\?\'.
std::error_code widenPath(const Twine &Path8,
@@ -71,7 +72,7 @@ std::error_code widenPath(const Twine &P
// Several operations would convert Path8 to SmallString; more efficient to
// do it once up front.
- SmallString<128> Path8Str;
+ SmallString<2*MAX_PATH> Path8Str;
Path8.toVector(Path8Str);
// If we made this path absolute, how much longer would it get?
@@ -111,11 +112,17 @@ std::error_code widenPath(const Twine &P
else
llvm::sys::path::append(FullPath, *I);
}
- return UTF8ToUTF16(FullPath, Path16);
+ Path8Str = FullPath;
}
- // Just use the caller's original path.
- return UTF8ToUTF16(Path8Str, Path16);
+ // Path8Str now contains the full path or the original path
+ // If the conversion from UTF8 to UTF16 fails because of ERROR_NO_UNICODE_TRANSLATION,
+ // we also try using the current code page before giving up
+ auto ec = UTF8ToUTF16(Path8Str, Path16);
+ if (ec == mapWindowsError(ERROR_NO_UNICODE_TRANSLATION)) {
+ ec = CurCPToUTF16(Path8Str, Path16);
+ }
+ return ec;
}
} // end namespace path
@@ -1293,23 +1300,26 @@ void system_temp_directory(bool ErasedOn
} // end namespace path
namespace windows {
-std::error_code UTF8ToUTF16(llvm::StringRef utf8,
- llvm::SmallVectorImpl<wchar_t> &utf16) {
- if (!utf8.empty()) {
- int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(),
- utf8.size(), utf16.begin(), 0);
+std::error_code CodePageToUTF16(unsigned codepage,
+ llvm::StringRef original,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ if (!original.empty()) {
+ int len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(),
+ original.size(), utf16.begin(), 0);
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
utf16.reserve(len + 1);
utf16.set_size(len);
- len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, utf8.begin(),
- utf8.size(), utf16.begin(), utf16.size());
+ len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(),
+ original.size(), utf16.begin(), utf16.size());
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
}
// Make utf16 null terminated.
@@ -1319,32 +1329,44 @@ std::error_code UTF8ToUTF16(llvm::String
return std::error_code();
}
+std::error_code UTF8ToUTF16(llvm::StringRef utf8,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ return CodePageToUTF16(CP_UTF8, utf8, utf16);
+}
+
+std::error_code CurCPToUTF16(llvm::StringRef curcp,
+ llvm::SmallVectorImpl<wchar_t> &utf16) {
+ return CodePageToUTF16(CP_ACP, curcp, utf16);
+}
+
static
std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
size_t utf16_len,
- llvm::SmallVectorImpl<char> &utf8) {
+ llvm::SmallVectorImpl<char> &converted) {
if (utf16_len) {
// Get length.
- int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.begin(),
+ int len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.begin(),
0, NULL, NULL);
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
- utf8.reserve(len);
- utf8.set_size(len);
+ converted.reserve(len);
+ converted.set_size(len);
// Now do the actual conversion.
- len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, utf8.data(),
- utf8.size(), NULL, NULL);
+ len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.data(),
+ converted.size(), NULL, NULL);
- if (len == 0)
+ if (len == 0) {
return mapWindowsError(::GetLastError());
+ }
}
- // Make utf8 null terminated.
- utf8.push_back(0);
- utf8.pop_back();
+ // Make the new string null terminated.
+ converted.push_back(0);
+ converted.pop_back();
return std::error_code();
}
@@ -1355,8 +1377,8 @@ std::error_code UTF16ToUTF8(const wchar_
}
std::error_code UTF16ToCurCP(const wchar_t *utf16, size_t utf16_len,
- llvm::SmallVectorImpl<char> &utf8) {
- return UTF16ToCodePage(CP_ACP, utf16, utf16_len, utf8);
+ llvm::SmallVectorImpl<char> &curcp) {
+ return UTF16ToCodePage(CP_ACP, utf16, utf16_len, curcp);
}
} // end namespace windows
Modified: llvm/trunk/lib/Support/Windows/WindowsSupport.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/Windows/WindowsSupport.h?rev=329468&r1=329467&r2=329468&view=diff
==============================================================================
--- llvm/trunk/lib/Support/Windows/WindowsSupport.h (original)
+++ llvm/trunk/lib/Support/Windows/WindowsSupport.h Fri Apr 6 17:32:59 2018
@@ -254,6 +254,8 @@ std::error_code widenPath(const Twine &P
namespace windows {
std::error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
+/// Convert to UTF16 from the current code page used in the system
+std::error_code CurCPToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16);
std::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
SmallVectorImpl<char> &utf8);
/// Convert from UTF16 to the current code page used in the system
More information about the llvm-commits
mailing list