[cfe-commits] r143416 - in /cfe/trunk: include/clang/Basic/DiagnosticLexKinds.td include/clang/Lex/LiteralSupport.h lib/Lex/LiteralSupport.cpp
Eli Friedman
eli.friedman at gmail.com
Mon Oct 31 19:14:51 PDT 2011
Author: efriedma
Date: Mon Oct 31 21:14:50 2011
New Revision: 143416
URL: http://llvm.org/viewvc/llvm-project?rev=143416&view=rev
Log:
Perform proper conversion for strings encoded in the source file as UTF-8. (For now, we are assuming the source character set is always UTF-8; this can be easily extended if necessary.)
Tests will be coming up in a subsequent commit.
Patch by Seth Cantrell.
Modified:
cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
cfe/trunk/include/clang/Lex/LiteralSupport.h
cfe/trunk/lib/Lex/LiteralSupport.cpp
Modified: cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td?rev=143416&r1=143415&r2=143416&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td (original)
+++ cfe/trunk/include/clang/Basic/DiagnosticLexKinds.td Mon Oct 31 21:14:50 2011
@@ -130,6 +130,8 @@
InGroup<CXX98Compat>, DefaultIgnore;
def err_unsupported_string_concat : Error<
"unsupported non-standard concatenation of string literals">;
+def err_bad_string_encoding : Error<
+ "illegal sequence in string literal">;
//===----------------------------------------------------------------------===//
// PTH Diagnostics
Modified: cfe/trunk/include/clang/Lex/LiteralSupport.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/LiteralSupport.h?rev=143416&r1=143415&r2=143416&view=diff
==============================================================================
--- cfe/trunk/include/clang/Lex/LiteralSupport.h (original)
+++ cfe/trunk/include/clang/Lex/LiteralSupport.h Mon Oct 31 21:14:50 2011
@@ -197,7 +197,7 @@
private:
void init(const Token *StringToks, unsigned NumStringToks);
- void CopyStringFragment(StringRef Fragment);
+ bool CopyStringFragment(StringRef Fragment);
};
} // end namespace clang
Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=143416&r1=143415&r2=143416&view=diff
==============================================================================
--- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
+++ cfe/trunk/lib/Lex/LiteralSupport.cpp Mon Oct 31 21:14:50 2011
@@ -16,6 +16,7 @@
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/ConvertUTF.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
using namespace clang;
@@ -1033,7 +1034,14 @@
ThisTokEnd -= (ThisTokBuf - Prefix);
// Copy the string over
- CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
+ if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf)))
+ {
+ if (Diags)
+ Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+ diag::err_bad_string_encoding);
+ hadError = true;
+ }
+
} else {
assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
++ThisTokBuf; // skip "
@@ -1060,7 +1068,13 @@
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
// Copy the character span over.
- CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
+ if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart)))
+ {
+ if (Diags)
+ Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+ diag::err_bad_string_encoding);
+ hadError = true;
+ }
continue;
}
// Is this a Universal Character Name escape?
@@ -1116,20 +1130,39 @@
/// copyStringFragment - This function copies from Start to End into ResultPtr.
/// Performs widening for multi-byte characters.
-void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
+bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
+ assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
+ ConversionResult result = conversionOK;
// Copy the character span over.
if (CharByteWidth == 1) {
memcpy(ResultPtr, Fragment.data(), Fragment.size());
ResultPtr += Fragment.size();
- } else {
- // Note: our internal rep of wide char tokens is always little-endian.
- for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
- *ResultPtr++ = *I;
- // Add zeros at the end.
- for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
- *ResultPtr++ = 0;
- }
- }
+ } else if (CharByteWidth == 2) {
+ UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
+ ConversionFlags flags = lenientConversion;
+ result = ConvertUTF8toUTF16(
+ &sourceStart,sourceStart + Fragment.size(),
+ &targetStart,targetStart + 2*Fragment.size(),flags);
+ if (result==conversionOK)
+ ResultPtr = reinterpret_cast<char*>(targetStart);
+ } else if (CharByteWidth == 4) {
+ UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
+ ConversionFlags flags = lenientConversion;
+ result = ConvertUTF8toUTF32(
+ &sourceStart,sourceStart + Fragment.size(),
+ &targetStart,targetStart + 4*Fragment.size(),flags);
+ if (result==conversionOK)
+ ResultPtr = reinterpret_cast<char*>(targetStart);
+ }
+ assert((result != targetExhausted)
+ && "ConvertUTF8toUTFXX exhausted target buffer");
+ return result != conversionOK;
}
More information about the cfe-commits mailing list