[cfe-commits] r39390 - in /cfe/cfe/trunk: Lex/LiteralSupport.cpp include/clang/Lex/LiteralSupport.h

Wed Jul 11 09:43:52 PDT 2007

Author: clattner
Date: Wed Jul 11 11:43:52 2007
New Revision: 39390

URL: http://llvm.org/viewvc/llvm-project?rev=39390&view=rev
Log:
Implement support for performing semantic analysis of character literals.

Modified:
    cfe/cfe/trunk/Lex/LiteralSupport.cpp
    cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h

Modified: cfe/cfe/trunk/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/Lex/LiteralSupport.cpp?rev=39390&r1=39389&r2=39390&view=diff

==============================================================================

--- cfe/cfe/trunk/Lex/LiteralSupport.cpp (original)
+++ cfe/cfe/trunk/Lex/LiteralSupport.cpp Wed Jul 11 11:43:52 2007
@@ -1,4 +1,4 @@
-//===--- LiteralSupport.cpp - Code to parse and process literals-*- C++ -*-===//
+//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file implements the NumericLiteralParser interface.
+// This file implements the NumericLiteralParser, CharLiteralParser, and
+// StringLiteralParser interfaces.
 //
 //===----------------------------------------------------------------------===//
 
@@ -20,6 +21,103 @@
 using namespace llvm;
 using namespace clang;
 
+/// HexDigitValue - Return the value (0-15) of the specified hex digit, either
+/// upper or lower case, or -1 if it's not a valid hex digit.
+static int HexDigitValue(char C) {
+  if (C >= '0' && C <= '9') return C-'0';
+  if (C >= 'a' && C <= 'f') return C-'a'+10;
+  if (C >= 'A' && C <= 'F') return C-'A'+10;
+  return -1;
+}
+
+/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
+/// either a character or a string literal.  ThisTokBuf must point at the '\'
+/// and is advanced past what is consumed.  Returns the decoded character (for
+/// unknown escapes, the escaped character itself); HadError is set on error.
+static unsigned ProcessCharEscape(const char *&ThisTokBuf,
+                                  const char *ThisTokEnd, bool &HadError,
+                                  SourceLocation Loc, Preprocessor &PP) {
+  // Skip the '\' char.
+  ++ThisTokBuf;
+
+  // We know that this character can't be off the end of the buffer, because
+  // that would have been \", which would not have been the end of string.
+  unsigned ResultChar = *ThisTokBuf++;
+  switch (ResultChar) {
+  // These map to themselves.
+  case '\\': case '\'': case '"': case '?': break;
+    // These have fixed mappings.
+  case 'a':
+    // TODO: K&R: the meaning of '\\a' is different in traditional C
+    ResultChar = 7;
+    break;
+  case 'b':
+    ResultChar = 8;
+    break;
+  case 'e':
+    PP.Diag(Loc, diag::ext_nonstandard_escape, "e");
+    ResultChar = 27;
+    break;
+  case 'f':
+    ResultChar = 12;
+    break;
+  case 'n':
+    ResultChar = 10;
+    break;
+  case 'r':
+    ResultChar = 13;
+    break;
+  case 't':
+    ResultChar = 9;
+    break;
+  case 'v':
+    ResultChar = 11;
+    break;
+    //case 'u': case 'U':  // FIXME: UCNs.
+  case 'x': // Hex escape.
+    if (ThisTokBuf == ThisTokEnd ||
+        (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
+      PP.Diag(Loc, diag::err_hex_escape_no_digits);
+      HadError = true;
+      ResultChar = 0;
+      break;
+    }
+    ++ThisTokBuf; // Consumed one hex digit.
+    // FIXME: warn_hex_escape_too_large. '\x12345'
+    assert(0 && "hex escape: unimp!");
+    break;
+  case '0': case '1': case '2': case '3':
+  case '4': case '5': case '6': case '7':
+    // Octal escapes.
+    // FIXME: warn_octal_escape_too_large. '\012345'
+    assert(0 && "octal escape: unimp!");
+    break;
+    // Otherwise, these are not valid escapes.
+  case '(': case '{': case '[': case '%':
+    // GCC accepts these as extensions.  We warn about them as such though.
+    if (!PP.getLangOptions().NoExtensions) {
+      PP.Diag(Loc, diag::ext_nonstandard_escape,
+              std::string()+(char)ResultChar);
+      break;
+    }
+    // FALL THROUGH.
+  default:
+    // ThisTokBuf already points past the escaped character, so classify
+    // ResultChar itself; the range check keeps isgraph's argument valid.
+    if (ResultChar < 256 && isgraph(ResultChar)) {
+      PP.Diag(Loc, diag::ext_unknown_escape, std::string()+(char)ResultChar);
+    } else {
+      PP.Diag(Loc, diag::ext_unknown_escape, "x"+utohexstr(ResultChar));
+    }
+    break;
+  }
+
+  return ResultChar;
+}
+
+
+
+
 ///       integer-constant: [C99 6.4.4.1]
 ///         decimal-constant integer-suffix
 ///         octal-constant integer-suffix
@@ -61,9 +159,8 @@
 
 NumericLiteralParser::
 NumericLiteralParser(const char *begin, const char *end,
-                     SourceLocation TokLoc, Preprocessor &pp) :
-  PP(pp), ThisTokBegin(begin), ThisTokEnd(end)
-{
+                     SourceLocation TokLoc, Preprocessor &pp)
+  : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
   s = DigitsBegin = begin;
   saw_exponent = false;
   saw_period = false;
@@ -217,16 +314,6 @@
   }
 }
 
-static unsigned HexLetterToVal(char c) {
-  if (c >= '0' && c <= '9')
-    return c - '0';
-  else if (c >= 'A' && c <= 'F') 
-    return c - 'A' - 10; 
-  else
-    assert(c >= 'a' && c <= 'f' && "Lexer scanning error");
-  return c - 'a' - 10;
-}
-
 bool NumericLiteralParser::GetIntegerValue(uintmax_t &val) {
   uintmax_t max_value = UINTMAX_MAX / radix;
   unsigned max_digit = UINTMAX_MAX % radix;
@@ -234,7 +321,7 @@
   val = 0;
   s = DigitsBegin;
   while (s < SuffixBegin) {
-    unsigned C = HexLetterToVal(*s++);
+    unsigned C = HexDigitValue(*s++);
     
     if (val > max_value || (val == max_value && C > max_digit)) {
       return false; // Overflow!
@@ -253,7 +340,7 @@
   val = 0;
   s = DigitsBegin;
   while (s < SuffixBegin) {
-    unsigned C = HexLetterToVal(*s++);
+    unsigned C = HexDigitValue(*s++);
     
     if (val > max_value || (val == max_value && C > max_digit)) {
       return false; // Overflow!
@@ -278,7 +365,7 @@
   
   bool OverflowOccurred = false;
   while (s < SuffixBegin) {
-    unsigned C = HexLetterToVal(*s++);
+    unsigned C = HexDigitValue(*s++);
     
     // If this letter is out of bound for this radix, reject it.
     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
@@ -309,6 +396,80 @@
   hadError = true;
 }
 
+
+CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
+                                     SourceLocation Loc, Preprocessor &PP) {
+  // Decode the literal.  At this point we know it matches the regex "L?'.*'".
+  HadError = false;
+  Value = 0;
+  
+  // Determine if this is a wide character.
+  IsWide = begin[0] == 'L';
+  if (IsWide) ++begin;
+  
+  // Skip over the entry quote.
+  assert(begin[0] == '\'' && "Invalid token lexed");
+  ++begin;
+
+  // FIXME: This assumes that 'int' is 32-bits in overflow calculation, and the
+  // size of "value".
+  assert(PP.getTargetInfo().getIntWidth(Loc) == 32 &&
+         "Assumes sizeof(int) == 4 for now");
+  // FIXME: This assumes that wchar_t is 32-bits for now.
+  assert(PP.getTargetInfo().getWCharWidth(Loc) == 32 && 
+         "Assumes sizeof(wchar_t) == 4 for now");
+  // FIXME: This extensively assumes that 'char' is 8-bits.
+  assert(PP.getTargetInfo().getCharWidth(Loc) == 8 &&
+         "Assumes char is 8 bits");
+  
+  bool isFirstChar = true;
+  bool isMultiChar = false;
+  while (begin[0] != '\'') {
+    unsigned ResultChar;
+    if (begin[0] != '\\')     // If this is a normal character, consume it.
+      ResultChar = *begin++;
+    else                      // Otherwise, this is an escape character.
+      ResultChar = ProcessCharEscape(begin, end, HadError, Loc, PP);
+
+    // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
+    // implementation defined (C99 6.4.4.4p10).
+    if (!isFirstChar) {
+      // If this is the second character being processed, do special handling.
+      if (!isMultiChar) {
+        isMultiChar = true;
+      
+        // Warn about discarding the top bits for multi-char wide-character
+        // constants (L'abcd').
+        if (IsWide)
+          PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+      }
+
+      if (IsWide) {
+        // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
+        Value = 0;
+      } else {
+        // Narrow character literals act as though their value is concatenated
+        // in this implementation.  Warn if the shift below discards set bits.
+        if (((Value << 8) >> 8) != Value)
+          PP.Diag(Loc, diag::warn_char_constant_too_large);
+        Value <<= 8;
+      }
+    }
+    
+    Value += ResultChar;
+    isFirstChar = false;
+  }
+  
+  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
+  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
+  // character constants are not sign extended in this implementation:
+  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
+  if (!IsWide && !isMultiChar && (Value & 128) &&
+      PP.getTargetInfo().isCharSigned(Loc))
+    Value = (signed char)Value;
+}
+
+
 ///       string-literal: [C99 6.4.5]
 ///          " [s-char-sequence] "
 ///         L" [s-char-sequence] "
@@ -342,12 +503,11 @@
 ///         \U hex-quad hex-quad
 ///       hex-quad:
 ///         hex-digit hex-digit hex-digit hex-digit
-
+///
 StringLiteralParser::
 StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
-                    Preprocessor &pp, TargetInfo &t) : 
-  PP(pp), Target(t) 
-{
+                    Preprocessor &pp, TargetInfo &t)
+  : PP(pp), Target(t) {
   // Scan all of the string portions, remember the max individual token length,
   // computing a bound on the concatenated string length, and see whether any
   // piece is a wide-string.  If any of the string portions is a wide-string
@@ -357,8 +517,9 @@
   AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
   
   hadError = false;
-  
-  // The common case is that there is only one string fragment.
+
+  // Implement Translation Phase #6: concatenation of string literals
+  // (C99 5.1.1.2p1).  The common case is only one string fragment.
   for (unsigned i = 1; i != NumStringToks; ++i) {
     // The string could be shorter than this if it needs cleaning, but this is a
     // reasonable bound, which is all we need.
@@ -381,8 +542,11 @@
   // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
   // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
   wchar_tByteWidth = ~0U;
-  if (AnyWide)
+  if (AnyWide) {
     wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
+    assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
+    wchar_tByteWidth /= 8;
+  }
   
   // The output buffer size needs to be large enough to hold wide characters.
   // This is a worst-case assumption which basically corresponds to L"" "long".
@@ -441,80 +605,9 @@
         continue;
       }
       
-      // Otherwise, this is an escape character.  Skip the '\' char.
-      ++ThisTokBuf;
-      
-      // We know that this character can't be off the end of the buffer, because
-      // that would have been \", which would not have been the end of string.
-      unsigned ResultChar = *ThisTokBuf++;
-      switch (ResultChar) {
-        // These map to themselves.
-      case '\\': case '\'': case '"': case '?': break;
-        
-        // These have fixed mappings.
-      case 'a':
-        // TODO: K&R: the meaning of '\\a' is different in traditional C
-        ResultChar = 7;
-        break;
-      case 'b':
-        ResultChar = 8;
-        break;
-      case 'e':
-        Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
-        ResultChar = 27;
-        break;
-      case 'f':
-        ResultChar = 12;
-        break;
-      case 'n':
-        ResultChar = 10;
-        break;
-      case 'r':
-        ResultChar = 13;
-        break;
-      case 't':
-        ResultChar = 9;
-        break;
-      case 'v':
-        ResultChar = 11;
-        break;
-        
-        //case 'u': case 'U':  // FIXME: UCNs.
-      case 'x': // Hex escape.
-        if (ThisTokBuf == ThisTokEnd ||
-            (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
-          Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
-          ResultChar = 0;
-          break;
-        }
-        ++ThisTokBuf; // Consumed one hex digit.
-        
-        assert(0 && "hex escape: unimp!");
-        break;
-      case '0': case '1': case '2': case '3':
-      case '4': case '5': case '6': case '7':
-        // Octal escapes.
-        assert(0 && "octal escape: unimp!");
-        break;
-        
-        // Otherwise, these are not valid escapes.
-      case '(': case '{': case '[': case '%':
-        // GCC accepts these as extensions.  We warn about them as such though.
-        if (!PP.getLangOptions().NoExtensions) {
-          Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
-               std::string()+(char)ResultChar);
-          break;
-        }
-        // FALL THROUGH.
-      default:
-        if (isgraph(ThisTokBuf[0])) {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               std::string()+(char)ResultChar);
-        } else {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               "x"+utohexstr(ResultChar));
-        }
-      }
+      // Otherwise, this is an escape character.  Process it.
+      unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+                                              StringToks[i].getLocation(), PP);
       
       // Note: our internal rep of wide char tokens is always little-endian.
       *ResultPtr++ = ResultChar & 0xFF;
@@ -533,10 +626,3 @@
     *ResultPtr++ = 0;
   }
 }
-
-void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID, 
-                               const std::string &M) {
-  PP.Diag(Loc, DiagID, M);
-  hadError = true;
-}
-

Modified: cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h?rev=39390&r1=39389&r2=39390&view=diff

==============================================================================
--- cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h (original)
+++ cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h Wed Jul 11 11:43:52 2007
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file defines the NumericLiteralParser interface.
+// This file defines the NumericLiteralParser, CharLiteralParser, and
+// StringLiteralParser interfaces.
 //
 //===----------------------------------------------------------------------===//
 
@@ -27,6 +28,9 @@
 class SourceLocation;
 class TargetInfo;
     
+/// NumericLiteralParser - This performs strict semantic analysis of the content
+/// of a ppnumber, classifying it as either integer, floating, or erroneous,
+/// determines the radix of the value and can convert it to a useful value.
 class NumericLiteralParser {
   Preprocessor &PP; // needed for diagnostics
   
@@ -102,6 +106,24 @@
   }
 };
 
+/// CharLiteralParser - Perform interpretation and semantic analysis of a
+/// character literal.  The constructor does the work; accessors report it.
+class CharLiteralParser {
+  unsigned Value;   // Decoded value of the literal.
+  bool IsWide;      // True if the literal has an 'L' prefix.
+  bool HadError;    // True if an error was found while decoding.
+public:
+  CharLiteralParser(const char *begin, const char *end,
+                    SourceLocation Loc, Preprocessor &PP);
+
+  bool hadError() const { return HadError; }
+  bool isWide() const { return IsWide; }
+  unsigned getValue() const { return Value; }
+};
+
+/// StringLiteralParser - This decodes string escape characters and performs
+/// wide string analysis and Translation Phase #6 (concatenation of string
+/// literals) (C99 5.1.1.2p1).
 class StringLiteralParser {
   Preprocessor &PP;
   TargetInfo &Target;
@@ -119,18 +141,6 @@
   
   const char *GetString() { return &ResultBuf[0]; }
   unsigned GetStringLength() { return ResultPtr-&ResultBuf[0]; }
-private:
-  void Diag(SourceLocation Loc, unsigned DiagID, 
-            const std::string &M = std::string());
-
-  /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
-  /// not valid.
-  static int HexDigitValue(char C) {
-    if (C >= '0' && C <= '9') return C-'0';
-    if (C >= 'a' && C <= 'f') return C-'a'+10;
-    if (C >= 'A' && C <= 'F') return C-'A'+10;
-    return -1;
-  }
 };
   
 }  // end namespace clang