[cfe-commits] r152344 - in /cfe/trunk: include/clang/Basic/ConvertUTF.h lib/Basic/ConvertUTF.c lib/Lex/LiteralSupport.cpp test/Lexer/cxx0x_raw_string_delim_length.cpp test/Lexer/string-literal-encoding.c

Thu Mar 8 14:29:29 PST 2012

On Thu, Mar 8, 2012 at 1:59 PM, Richard Smith
<richard-llvm at metafoo.co.uk> wrote:
> Author: rsmith
> Date: Thu Mar  8 15:59:28 2012
> New Revision: 152344
>
> URL: http://llvm.org/viewvc/llvm-project?rev=152344&view=rev
> Log:
> When checking the encoding of an 8-bit string literal, don't just check the
> first codepoint! Also, don't reject empty raw string literals for spurious
> "encoding" issues. Also, don't rely on undefined behavior in ConvertUTF.c.
>
> Modified:
>    cfe/trunk/include/clang/Basic/ConvertUTF.h
>    cfe/trunk/lib/Basic/ConvertUTF.c
>    cfe/trunk/lib/Lex/LiteralSupport.cpp
>    cfe/trunk/test/Lexer/cxx0x_raw_string_delim_length.cpp
>    cfe/trunk/test/Lexer/string-literal-encoding.c
>
> Modified: cfe/trunk/include/clang/Basic/ConvertUTF.h
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/ConvertUTF.h?rev=152344&r1=152343&r2=152344&view=diff
> ==============================================================================
> --- cfe/trunk/include/clang/Basic/ConvertUTF.h (original)
> +++ cfe/trunk/include/clang/Basic/ConvertUTF.h Thu Mar  8 15:59:28 2012
> @@ -151,9 +151,11 @@
>  ConversionResult ConvertUTF32toUTF16 (
>   const UTF32** sourceStart, const UTF32* sourceEnd,
>   UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
> -#endif
>
>  Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
> +#endif
> +
> +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd);
>
>  #ifdef __cplusplus
>  }
>
> Modified: cfe/trunk/lib/Basic/ConvertUTF.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/ConvertUTF.c?rev=152344&r1=152343&r2=152344&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Basic/ConvertUTF.c (original)
> +++ cfe/trunk/lib/Basic/ConvertUTF.c Thu Mar  8 15:59:28 2012
> @@ -387,7 +387,7 @@
>  */
>  Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
>     int length = trailingBytesForUTF8[*source]+1;
> -    if (source+length > sourceEnd) {
> +    if (length > sourceEnd - source) {
>         return false;
>     }
>     return isLegalUTF8(source, length);
> @@ -395,6 +395,22 @@
>
>  /* --------------------------------------------------------------------- */
>
> +/*
> + * Exported function to return whether a UTF-8 string is legal or not.
> + * This is not used here; it's just exported.
> + */
> +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
> +    while (source != sourceEnd) {
> +        int length = trailingBytesForUTF8[*source] + 1;
> +        if (length > sourceEnd - source || !isLegalUTF8(source, length))
> +            return false;
> +        source += length;
> +    }
> +    return true;
> +}
> +
> +/* --------------------------------------------------------------------- */
> +
>  ConversionResult ConvertUTF8toUTF16 (
>         const UTF8** sourceStart, const UTF8* sourceEnd,
>         UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
> @@ -404,7 +420,7 @@
>     while (source < sourceEnd) {
>         UTF32 ch = 0;
>         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
> -        if (source + extraBytesToRead >= sourceEnd) {
> +        if (extraBytesToRead >= sourceEnd - source) {
>             result = sourceExhausted; break;
>         }
>         /* Do this check whether lenient or strict */
> @@ -477,7 +493,7 @@
>     while (source < sourceEnd) {
>         UTF32 ch = 0;
>         unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
> -        if (source + extraBytesToRead >= sourceEnd) {
> +        if (extraBytesToRead >= sourceEnd - source) {
>             result = sourceExhausted; break;
>         }
>         /* Do this check whether lenient or strict */
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=152344&r1=152343&r2=152344&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Thu Mar  8 15:59:28 2012
> @@ -333,7 +333,7 @@
>  ///         decimal-constant integer-suffix
>  ///         octal-constant integer-suffix
>  ///         hexadecimal-constant integer-suffix
> -///       user-defiend-integer-literal: [C++11 lex.ext]
> +///       user-defined-integer-literal: [C++11 lex.ext]
>  ///         decimal-literal ud-suffix
>  ///         octal-literal ud-suffix
>  ///         hexadecimal-literal ud-suffix
> @@ -1167,17 +1167,14 @@
>         ++ThisTokBuf;
>       ++ThisTokBuf; // skip '('
>
> -      // remove same number of characters from the end
> -      if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
> -        ThisTokEnd -= (ThisTokBuf - Prefix);
> +      // Remove same number of characters from the end
> +      ThisTokEnd -= ThisTokBuf - Prefix;
> +      assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
>
>       // Copy the string over
> -      if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf)))
> -      {
> +      if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
>         if (DiagnoseBadString(StringToks[i]))
>           hadError = true;
> -      }
> -
>     } else {
>       assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
>       ++ThisTokBuf; // skip "
> @@ -1204,11 +1201,9 @@
>           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
>
>           // Copy the character span over.
> -          if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart)))
> -          {
> +          if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
>             if (DiagnoseBadString(StringToks[i]))
>               hadError = true;
> -          }
>           continue;
>         }
>         // Is this a Universal Character Name escape?
> @@ -1292,8 +1287,8 @@
>   ConversionResult result = conversionOK;
>   // Copy the character span over.
>   if (CharByteWidth == 1) {
> -    if (!isLegalUTF8Sequence(reinterpret_cast<const UTF8*>(Fragment.begin()),
> -                             reinterpret_cast<const UTF8*>(Fragment.end())))
> +    if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
> +                           reinterpret_cast<const UTF8*>(Fragment.end())))

Ah, I think that one (validating only the first codepoint via isLegalUTF8Sequence) is my fault... thanks for spotting it.

-Eli