[cfe-commits] r166900 - in /cfe/trunk: lib/Lex/LiteralSupport.cpp test/Misc/wrong-encoding.c

Tue Oct 30 16:23:34 PDT 2012

On Sun, Oct 28, 2012 at 11:24 AM, Seth Cantrell <seth.cantrell at gmail.com> wrote:
> Author: socantre
> Date: Sun Oct 28 13:24:46 2012
> New Revision: 166900
>
> URL: http://llvm.org/viewvc/llvm-project?rev=166900&view=rev
> Log:
> improve highlighting of invalid string encodings
>
> limit highlight to exactly the bad encoding, and highlight every
> bad encoding in a string.
>
> Modified:
>     cfe/trunk/lib/Lex/LiteralSupport.cpp
>     cfe/trunk/test/Misc/wrong-encoding.c
>
> Modified: cfe/trunk/lib/Lex/LiteralSupport.cpp
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/LiteralSupport.cpp?rev=166900&r1=166899&r2=166900&view=diff
> ==============================================================================
> --- cfe/trunk/lib/Lex/LiteralSupport.cpp (original)
> +++ cfe/trunk/lib/Lex/LiteralSupport.cpp Sun Oct 28 13:24:46 2012
> @@ -49,6 +49,20 @@
>    }
>  }
>
> +static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
> +                                           FullSourceLoc TokLoc,
> +                                           const char *TokBegin,
> +                                           const char *TokRangeBegin,
> +                                           const char *TokRangeEnd) {
> +  SourceLocation Begin =
> +    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
> +                                   TokLoc.getManager(), Features);
> +  SourceLocation End =
> +    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
> +                                   TokLoc.getManager(), Features);
> +  return CharSourceRange::getCharRange(Begin, End);
> +}
> +
>  /// \brief Produce a diagnostic highlighting some portion of a literal.
>  ///
>  /// Emits the diagnostic \p DiagID, highlighting the range of characters from
> @@ -61,11 +75,8 @@
>    SourceLocation Begin =
>      Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
>                                     TokLoc.getManager(), Features);
> -  SourceLocation End =
> -    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
> -                                   TokLoc.getManager(), Features);
> -  return Diags->Report(Begin, DiagID)
> -      << CharSourceRange::getCharRange(Begin, End);
> +  return Diags->Report(Begin, DiagID) <<
> +    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
>  }
>
>  /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
> @@ -1372,6 +1383,15 @@
>    }
>  }
>
> +static const char *resync_utf8(const char *err, const char *end) {
> +    if (err==end)
> +        return end;
> +    end = err + std::min<unsigned>(getNumBytesForUTF8(*err), end-err);
> +    while (++err!=end && (*err&0xC0)==0x80)
> +      ;
> +    return err;
> +}
> +
>  /// \brief This function copies from Fragment, which is a sequence of bytes
>  /// within Tok's contents (which begin at TokBegin) into ResultPtr.
>  /// Performs widening for multi-byte characters.
> @@ -1381,7 +1401,6 @@
>    const UTF8 *ErrorPtrTmp;
>    if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
>      return false;
> -  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
>
>    // If we see bad encoding for unprefixed string literals, warn and
>    // simply copy the byte values, for compatibility with gcc and older
> @@ -1391,12 +1410,31 @@
>      memcpy(ResultPtr, Fragment.data(), Fragment.size());
>      ResultPtr += Fragment.size();
>    }
> +
>    if (Diags) {
> -    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
> -         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
> -                                                 Fragment.end() - ErrorPtr),
> -         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
> -                              : diag::err_bad_string_encoding);
> +    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
> +
> +    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
> +    const DiagnosticBuilder &Builder =
> +      Diag(Diags, Features, SourceLoc, TokBegin,
> +           ErrorPtr, resync_utf8(ErrorPtr, Fragment.end()),
> +           NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
> +                                : diag::err_bad_string_encoding);
> +
> +    char *SavedResultPtr = ResultPtr;
> +    const char *NextStart = resync_utf8(ErrorPtr, Fragment.end());
> +    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
> +
> +    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, ResultPtr,
> +                              ErrorPtrTmp)) {
> +      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
> +      NextStart = resync_utf8(ErrorPtr, Fragment.end());
> +      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
> +                                     ErrorPtr, NextStart);

This fails when the loop adds more ranges than the DiagnosticBuilder's
SourceRange storage can hold, i.e. when a string contains many bad
encodings. I fixed this (by limiting the number of ranges we add to the
diagnostic) in r167059. Feel free to try another solution if that one
isn't suitable in some way.

> +      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
> +    }
> +
> +    ResultPtr = SavedResultPtr;
>    }
>    return !NoErrorOnBadEncoding;
>  }
>
> Modified: cfe/trunk/test/Misc/wrong-encoding.c
> URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Misc/wrong-encoding.c?rev=166900&r1=166899&r2=166900&view=diff
> ==============================================================================
> --- cfe/trunk/test/Misc/wrong-encoding.c (original)
> +++ cfe/trunk/test/Misc/wrong-encoding.c Sun Oct 28 13:24:46 2012
> @@ -1,16 +1,33 @@
> -// RUN: %clang_cc1 -fsyntax-only %s 2>&1 | FileCheck -strict-whitespace %s
> +// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value %s 2>&1 | FileCheck -strict-whitespace %s
>
>  void foo() {
>
>    "§Ã"; // ø
>  // CHECK: {{^  "<A7><C3>"; // <F8>}}
> -// CHECK: {{^   \^~~~}}
> +// CHECK: {{^   \^~~~~~~}}
>
>    /* þ« */ const char *d = "¥";
>
>  // CHECK: {{^  /\* <FE><AB> \*/ const char \*d = "<A5>";}}
>  // CHECK: {{^                                  \^~~~}}
>
> -// CHECK: {{^  "<A7><C3>"; // <F8>}}
> -// CHECK: {{^  \^~~~~~~~~~}}
> +  "xxé¿¿¿d";
> +// CHECK: {{^  "xx<U\+9FFF><BF>d";}}
> +// CHECK: {{^             \^~~~}}
> +
> +  "xxé¿bcd";
> +// CHECK: {{^  "xx<E9><BF>bcd";}}
> +// CHECK: {{^     \^~~~~~~~}}
> +
> +  "xxéabcd";
> +// CHECK: {{^  "xx<E9>abcd";}}
> +// CHECK: {{^     \^~~~}}
> +
> +  "xxé¿é¿d";
> +// CHECK: {{^  "xx<E9><BF><E9><BF>d";}}
> +// CHECK: {{^     \^~~~~~~~~~~~~~~}}
> +
> +  "xxé¿xxxxxxxxxxxxxxxxxxxxxé¿xx";
> +// CHECK: {{^  "xx<E9><BF>xxxxxxxxxxxxxxxxxxxxx<E9><BF>xx";}}
> +// CHECK: {{^     \^~~~~~~~                     ~~~~~~~~}}
>  }
>
>
> _______________________________________________
> cfe-commits mailing list
> cfe-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits