[cfe-commits] r39354 - in /cfe/cfe/trunk: AST/SemaExpr.cpp Lex/LiteralSupport.cpp Sema/SemaExpr.cpp include/clang/Lex/LiteralSupport.h

snaroff at cs.uiuc.edu snaroff at cs.uiuc.edu
Wed Jul 11 09:43:31 PDT 2007


Author: snaroff
Date: Wed Jul 11 11:43:31 2007
New Revision: 39354

URL: http://llvm.org/viewvc/llvm-project?rev=39354&view=rev
Log:
Bug #:
Submitted by:
Reviewed by:
Move string literal parsing from Sema to LiteralSupport. This consolidates
all the quirky parsing code within the Lexer subsystem (yeah!). This
simplifies Sema and (more importantly) allows future parsers
(i.e. subclasses of Action) to benefit from this code.

Modified:
    cfe/cfe/trunk/AST/SemaExpr.cpp
    cfe/cfe/trunk/Lex/LiteralSupport.cpp
    cfe/cfe/trunk/Sema/SemaExpr.cpp
    cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h

Modified: cfe/cfe/trunk/AST/SemaExpr.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/AST/SemaExpr.cpp?rev=39354&r1=39353&r2=39354&view=diff

==============================================================================
--- cfe/cfe/trunk/AST/SemaExpr.cpp (original)
+++ cfe/cfe/trunk/AST/SemaExpr.cpp Wed Jul 11 11:43:31 2007
@@ -22,21 +22,9 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 using namespace clang;
 
-#include <iostream>
-
-/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
-/// not valid.
-static int HexDigitValue(char C) {
-  if (C >= '0' && C <= '9') return C-'0';
-  if (C >= 'a' && C <= 'f') return C-'a'+10;
-  if (C >= 'A' && C <= 'F') return C-'A'+10;
-  return -1;
-}
-
 /// ParseStringLiteral - The specified tokens were lexed as pasted string
 /// fragments (e.g. "foo" "bar" L"baz").  The result string has to handle string
 /// concatenation ([C99 5.1.1.2, translation phase #6]), so it may come from
@@ -47,198 +35,18 @@
 Sema::ParseStringLiteral(const LexerToken *StringToks, unsigned NumStringToks) {
   assert(NumStringToks && "Must have at least one string!");
 
-  // Scan all of the string portions, remember the max individual token length,
-  // computing a bound on the concatenated string length, and see whether any
-  // piece is a wide-string.  If any of the string portions is a wide-string
-  // literal, the result is a wide-string literal [C99 6.4.5p4].
-  unsigned MaxTokenLength = StringToks[0].getLength();
-  unsigned SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  bool AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
-  
-  // The common case is that there is only one string fragment.
-  for (unsigned i = 1; i != NumStringToks; ++i) {
-    // The string could be shorter than this if it needs cleaning, but this is a
-    // reasonable bound, which is all we need.
-    SizeBound += StringToks[i].getLength()-2;  // -2 for "".
-
-    // Remember maximum string piece length.
-    if (StringToks[i].getLength() > MaxTokenLength) 
-      MaxTokenLength = StringToks[i].getLength();
-    
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
-  }
-  
-  
-  // Include space for the null terminator.
-  ++SizeBound;
-  
-  // TODO: K&R warning: "traditional C rejects string constant concatenation"
-  
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  unsigned wchar_tByteWidth = ~0U;
-  if (AnyWide)
-    wchar_tByteWidth =Context.Target.getWCharWidth(StringToks[0].getLocation());
-  
-  // The output buffer size needs to be large enough to hold wide characters.
-  // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
-  
-  // Create a temporary buffer to hold the result string data.
-  SmallString<512> ResultBuf;
-  ResultBuf.resize(SizeBound);
-  
-  // Likewise, but for each string piece.
-  SmallString<512> TokenBuf;
-  TokenBuf.resize(MaxTokenLength);
-  
-  // Loop over all the strings, getting their spelling, and expanding them to
-  // wide strings as appropriate.
-  char *ResultPtr = &ResultBuf[0];   // Next byte to fill in.
-  
-  for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
-    const char *ThisTokBuf = &TokenBuf[0];
-    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
-    // that ThisTokBuf points to a buffer that is big enough for the whole token
-    // and 'spelled' tokens can only shrink.
-    unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
-    const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    
-    // TODO: Input character set mapping support.
-    
-    // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
-    
-    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
-    ++ThisTokBuf;
-    
-    while (ThisTokBuf != ThisTokEnd) {
-      // Is this a span of non-escape characters?
-      if (ThisTokBuf[0] != '\\') {
-        const char *InStart = ThisTokBuf;
-        do {
-          ++ThisTokBuf;
-        } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
-        
-        // Copy the character span over.
-        unsigned Len = ThisTokBuf-InStart;
-        if (!AnyWide) {
-          memcpy(ResultPtr, InStart, Len);
-          ResultPtr += Len;
-        } else {
-          // Note: our internal rep of wide char tokens is always little-endian.
-          for (; Len; --Len, ++InStart) {
-            *ResultPtr++ = InStart[0];
-            // Add zeros at the end.
-            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-              *ResultPtr++ = 0;
-          }
-        }
-        continue;
-      }
-      
-      // Otherwise, this is an escape character.  Skip the '\' char.
-      ++ThisTokBuf;
-      
-      // We know that this character can't be off the end of the buffer, because
-      // that would have been \", which would not have been the end of string.
-      unsigned ResultChar = *ThisTokBuf++;
-      switch (ResultChar) {
-      // These map to themselves.
-      case '\\': case '\'': case '"': case '?': break;
-        
-      // These have fixed mappings.
-      case 'a':
-        // TODO: K&R: the meaning of '\\a' is different in traditional C
-        ResultChar = 7;
-        break;
-      case 'b':
-        ResultChar = 8;
-        break;
-      case 'e':
-        Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
-        ResultChar = 27;
-        break;
-      case 'f':
-        ResultChar = 12;
-        break;
-      case 'n':
-        ResultChar = 10;
-        break;
-      case 'r':
-        ResultChar = 13;
-        break;
-      case 't':
-        ResultChar = 9;
-        break;
-      case 'v':
-        ResultChar = 11;
-        break;
-        
-      //case 'u': case 'U':  // FIXME: UCNs.
-      case 'x': // Hex escape.
-        if (ThisTokBuf == ThisTokEnd ||
-            (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
-          Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
-          ResultChar = 0;
-          break;
-        }
-        ++ThisTokBuf; // Consumed one hex digit.
-        
-        assert(0 && "hex escape: unimp!");
-        break;
-      case '0': case '1': case '2': case '3':
-      case '4': case '5': case '6': case '7':
-        // Octal escapes.
-        assert(0 && "octal escape: unimp!");
-        break;
-        
-      // Otherwise, these are not valid escapes.
-      case '(': case '{': case '[': case '%':
-        // GCC accepts these as extensions.  We warn about them as such though.
-        if (!PP.getLangOptions().NoExtensions) {
-          Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
-               std::string()+(char)ResultChar);
-          break;
-        }
-        // FALL THROUGH.
-      default:
-        if (isgraph(ThisTokBuf[0])) {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               std::string()+(char)ResultChar);
-        } else {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               "x"+utohexstr(ResultChar));
-        }
-      }
-
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
-      
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
-      }
-    }
-  }
-  
-  // Add zero terminator.
-  *ResultPtr = 0;
-  if (AnyWide) {
-    for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-      *ResultPtr++ = 0;
-  }
-  
+  StringLiteralParser Literal(StringToks, NumStringToks, PP, Context.Target);
+  if (Literal.hadError)
+    return ExprResult(true);
+
   SmallVector<SourceLocation, 4> StringTokLocs;
   for (unsigned i = 0; i != NumStringToks; ++i)
     StringTokLocs.push_back(StringToks[i].getLocation());
-  
+    
   // FIXME: use factory.
-  
   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
-  return new StringLiteral(&ResultBuf[0], ResultPtr-&ResultBuf[0], AnyWide);
+  return new StringLiteral(Literal.GetString(), Literal.GetStringLength(), 
+                           Literal.AnyWide);
 }
 
 

Modified: cfe/cfe/trunk/Lex/LiteralSupport.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/Lex/LiteralSupport.cpp?rev=39354&r1=39353&r2=39354&view=diff

==============================================================================
--- cfe/cfe/trunk/Lex/LiteralSupport.cpp (original)
+++ cfe/cfe/trunk/Lex/LiteralSupport.cpp Wed Jul 11 11:43:31 2007
@@ -15,6 +15,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/Diagnostic.h"
+#include "llvm/ADT/StringExtras.h"
 
 using namespace llvm;
 using namespace clang;
@@ -273,3 +274,233 @@
   PP.Diag(Loc, DiagID, M);
   hadError = true;
 }
+
+///       string-literal: [C99 6.4.5]
+///          " [s-char-sequence] "
+///         L" [s-char-sequence] "
+///       s-char-sequence:
+///         s-char
+///         s-char-sequence s-char
+///       s-char:
+///         any source character except the double quote ",
+///           backslash \, or newline character
+///         escape-character
+///         universal-character-name
+///       escape-character: [C99 6.4.4.4]
+///         \ escape-code
+///         universal-character-name
+///       escape-code:
+///         character-escape-code
+///         octal-escape-code
+///         hex-escape-code
+///       character-escape-code: one of
+///         n t b r f v a
+///         \ ' " ?
+///       octal-escape-code:
+///         octal-digit
+///         octal-digit octal-digit
+///         octal-digit octal-digit octal-digit
+///       hex-escape-code:
+///         x hex-digit
+///         hex-escape-code hex-digit
+///       universal-character-name:
+///         \u hex-quad
+///         \U hex-quad hex-quad
+///       hex-quad:
+///         hex-digit hex-digit hex-digit hex-digit
+
+StringLiteralParser::
+StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
+                    Preprocessor &pp, TargetInfo &t) : 
+  PP(pp), Target(t) 
+{
+  // Scan all of the string portions, remember the max individual token length,
+  // computing a bound on the concatenated string length, and see whether any
+  // piece is a wide-string.  If any of the string portions is a wide-string
+  // literal, the result is a wide-string literal [C99 6.4.5p4].
+  MaxTokenLength = StringToks[0].getLength();
+  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
+  AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
+  
+  // The common case is that there is only one string fragment.
+  for (unsigned i = 1; i != NumStringToks; ++i) {
+    // The string could be shorter than this if it needs cleaning, but this is a
+    // reasonable bound, which is all we need.
+    SizeBound += StringToks[i].getLength()-2;  // -2 for "".
+    
+    // Remember maximum string piece length.
+    if (StringToks[i].getLength() > MaxTokenLength) 
+      MaxTokenLength = StringToks[i].getLength();
+    
+    // Remember if we see any wide strings.
+    AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
+  }
+  
+  
+  // Include space for the null terminator.
+  ++SizeBound;
+  
+  // TODO: K&R warning: "traditional C rejects string constant concatenation"
+  
+  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
+  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
+  wchar_tByteWidth = ~0U;
+  if (AnyWide)
+    wchar_tByteWidth = Target.getWCharWidth(StringToks[0].getLocation());
+  
+  // The output buffer size needs to be large enough to hold wide characters.
+  // This is a worst-case assumption which basically corresponds to L"" "long".
+  if (AnyWide)
+    SizeBound *= wchar_tByteWidth;
+  
+  // Size the temporary buffer to hold the result string data.
+  ResultBuf.resize(SizeBound);
+  
+  // Likewise, but for each string piece.
+  SmallString<512> TokenBuf;
+  TokenBuf.resize(MaxTokenLength);
+  
+  // Loop over all the strings, getting their spelling, and expanding them to
+  // wide strings as appropriate.
+  ResultPtr = &ResultBuf[0];   // Next byte to fill in.
+  
+  for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
+    const char *ThisTokBuf = &TokenBuf[0];
+    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
+    // that ThisTokBuf points to a buffer that is big enough for the whole token
+    // and 'spelled' tokens can only shrink.
+    unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
+    const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
+    
+    // TODO: Input character set mapping support.
+    
+    // Skip L marker for wide strings.
+    if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
+    
+    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
+    ++ThisTokBuf;
+    
+    while (ThisTokBuf != ThisTokEnd) {
+      // Is this a span of non-escape characters?
+      if (ThisTokBuf[0] != '\\') {
+        const char *InStart = ThisTokBuf;
+        do {
+          ++ThisTokBuf;
+        } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
+        
+        // Copy the character span over.
+        unsigned Len = ThisTokBuf-InStart;
+        if (!AnyWide) {
+          memcpy(ResultPtr, InStart, Len);
+          ResultPtr += Len;
+        } else {
+          // Note: our internal rep of wide char tokens is always little-endian.
+          for (; Len; --Len, ++InStart) {
+            *ResultPtr++ = InStart[0];
+            // Add zeros at the end.
+            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+            *ResultPtr++ = 0;
+          }
+        }
+        continue;
+      }
+      
+      // Otherwise, this is an escape character.  Skip the '\' char.
+      ++ThisTokBuf;
+      
+      // We know that this character can't be off the end of the buffer, because
+      // that would have been \", which would not have been the end of string.
+      unsigned ResultChar = *ThisTokBuf++;
+      switch (ResultChar) {
+        // These map to themselves.
+      case '\\': case '\'': case '"': case '?': break;
+        
+        // These have fixed mappings.
+      case 'a':
+        // TODO: K&R: the meaning of '\\a' is different in traditional C
+        ResultChar = 7;
+        break;
+      case 'b':
+        ResultChar = 8;
+        break;
+      case 'e':
+        Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
+        ResultChar = 27;
+        break;
+      case 'f':
+        ResultChar = 12;
+        break;
+      case 'n':
+        ResultChar = 10;
+        break;
+      case 'r':
+        ResultChar = 13;
+        break;
+      case 't':
+        ResultChar = 9;
+        break;
+      case 'v':
+        ResultChar = 11;
+        break;
+        
+        //case 'u': case 'U':  // FIXME: UCNs.
+      case 'x': // Hex escape.
+        if (ThisTokBuf == ThisTokEnd ||
+            (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
+          Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
+          ResultChar = 0;
+          break;
+        }
+        ++ThisTokBuf; // Consumed one hex digit.
+        
+        assert(0 && "hex escape: unimp!");
+        break;
+      case '0': case '1': case '2': case '3':
+      case '4': case '5': case '6': case '7':
+        // Octal escapes.
+        assert(0 && "octal escape: unimp!");
+        break;
+        
+        // Otherwise, these are not valid escapes.
+      case '(': case '{': case '[': case '%':
+        // GCC accepts these as extensions.  We warn about them as such though.
+        if (!PP.getLangOptions().NoExtensions) {
+          Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
+               std::string()+(char)ResultChar);
+          break;
+        }
+        // FALL THROUGH.
+      default:
+        if (isgraph(ThisTokBuf[0])) {
+          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
+               std::string()+(char)ResultChar);
+        } else {
+          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
+               "x"+utohexstr(ResultChar));
+        }
+      }
+      
+      // Note: our internal rep of wide char tokens is always little-endian.
+      *ResultPtr++ = ResultChar & 0xFF;
+      
+      if (AnyWide) {
+        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+          *ResultPtr++ = ResultChar >> i*8;
+      }
+    }
+  }
+  
+  // Add zero terminator.
+  *ResultPtr = 0;
+  if (AnyWide) {
+    for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+    *ResultPtr++ = 0;
+  }
+}
+
+void StringLiteralParser::Diag(SourceLocation Loc, unsigned DiagID, 
+                               const std::string &M) {
+  PP.Diag(Loc, DiagID, M);
+  hadError = true;
+}
+

Modified: cfe/cfe/trunk/Sema/SemaExpr.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/Sema/SemaExpr.cpp?rev=39354&r1=39353&r2=39354&view=diff

==============================================================================
--- cfe/cfe/trunk/Sema/SemaExpr.cpp (original)
+++ cfe/cfe/trunk/Sema/SemaExpr.cpp Wed Jul 11 11:43:31 2007
@@ -22,21 +22,9 @@
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/TargetInfo.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 using namespace clang;
 
-#include <iostream>
-
-/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
-/// not valid.
-static int HexDigitValue(char C) {
-  if (C >= '0' && C <= '9') return C-'0';
-  if (C >= 'a' && C <= 'f') return C-'a'+10;
-  if (C >= 'A' && C <= 'F') return C-'A'+10;
-  return -1;
-}
-
 /// ParseStringLiteral - The specified tokens were lexed as pasted string
 /// fragments (e.g. "foo" "bar" L"baz").  The result string has to handle string
 /// concatenation ([C99 5.1.1.2, translation phase #6]), so it may come from
@@ -47,198 +35,18 @@
 Sema::ParseStringLiteral(const LexerToken *StringToks, unsigned NumStringToks) {
   assert(NumStringToks && "Must have at least one string!");
 
-  // Scan all of the string portions, remember the max individual token length,
-  // computing a bound on the concatenated string length, and see whether any
-  // piece is a wide-string.  If any of the string portions is a wide-string
-  // literal, the result is a wide-string literal [C99 6.4.5p4].
-  unsigned MaxTokenLength = StringToks[0].getLength();
-  unsigned SizeBound = StringToks[0].getLength()-2;  // -2 for "".
-  bool AnyWide = StringToks[0].getKind() == tok::wide_string_literal;
-  
-  // The common case is that there is only one string fragment.
-  for (unsigned i = 1; i != NumStringToks; ++i) {
-    // The string could be shorter than this if it needs cleaning, but this is a
-    // reasonable bound, which is all we need.
-    SizeBound += StringToks[i].getLength()-2;  // -2 for "".
-
-    // Remember maximum string piece length.
-    if (StringToks[i].getLength() > MaxTokenLength) 
-      MaxTokenLength = StringToks[i].getLength();
-    
-    // Remember if we see any wide strings.
-    AnyWide |= StringToks[i].getKind() == tok::wide_string_literal;
-  }
-  
-  
-  // Include space for the null terminator.
-  ++SizeBound;
-  
-  // TODO: K&R warning: "traditional C rejects string constant concatenation"
-  
-  // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
-  // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
-  unsigned wchar_tByteWidth = ~0U;
-  if (AnyWide)
-    wchar_tByteWidth =Context.Target.getWCharWidth(StringToks[0].getLocation());
-  
-  // The output buffer size needs to be large enough to hold wide characters.
-  // This is a worst-case assumption which basically corresponds to L"" "long".
-  if (AnyWide)
-    SizeBound *= wchar_tByteWidth;
-  
-  // Create a temporary buffer to hold the result string data.
-  SmallString<512> ResultBuf;
-  ResultBuf.resize(SizeBound);
-  
-  // Likewise, but for each string piece.
-  SmallString<512> TokenBuf;
-  TokenBuf.resize(MaxTokenLength);
-  
-  // Loop over all the strings, getting their spelling, and expanding them to
-  // wide strings as appropriate.
-  char *ResultPtr = &ResultBuf[0];   // Next byte to fill in.
-  
-  for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
-    const char *ThisTokBuf = &TokenBuf[0];
-    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
-    // that ThisTokBuf points to a buffer that is big enough for the whole token
-    // and 'spelled' tokens can only shrink.
-    unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf);
-    const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
-    
-    // TODO: Input character set mapping support.
-    
-    // Skip L marker for wide strings.
-    if (ThisTokBuf[0] == 'L') ++ThisTokBuf;
-    
-    assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
-    ++ThisTokBuf;
-    
-    while (ThisTokBuf != ThisTokEnd) {
-      // Is this a span of non-escape characters?
-      if (ThisTokBuf[0] != '\\') {
-        const char *InStart = ThisTokBuf;
-        do {
-          ++ThisTokBuf;
-        } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
-        
-        // Copy the character span over.
-        unsigned Len = ThisTokBuf-InStart;
-        if (!AnyWide) {
-          memcpy(ResultPtr, InStart, Len);
-          ResultPtr += Len;
-        } else {
-          // Note: our internal rep of wide char tokens is always little-endian.
-          for (; Len; --Len, ++InStart) {
-            *ResultPtr++ = InStart[0];
-            // Add zeros at the end.
-            for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-              *ResultPtr++ = 0;
-          }
-        }
-        continue;
-      }
-      
-      // Otherwise, this is an escape character.  Skip the '\' char.
-      ++ThisTokBuf;
-      
-      // We know that this character can't be off the end of the buffer, because
-      // that would have been \", which would not have been the end of string.
-      unsigned ResultChar = *ThisTokBuf++;
-      switch (ResultChar) {
-      // These map to themselves.
-      case '\\': case '\'': case '"': case '?': break;
-        
-      // These have fixed mappings.
-      case 'a':
-        // TODO: K&R: the meaning of '\\a' is different in traditional C
-        ResultChar = 7;
-        break;
-      case 'b':
-        ResultChar = 8;
-        break;
-      case 'e':
-        Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape, "e");
-        ResultChar = 27;
-        break;
-      case 'f':
-        ResultChar = 12;
-        break;
-      case 'n':
-        ResultChar = 10;
-        break;
-      case 'r':
-        ResultChar = 13;
-        break;
-      case 't':
-        ResultChar = 9;
-        break;
-      case 'v':
-        ResultChar = 11;
-        break;
-        
-      //case 'u': case 'U':  // FIXME: UCNs.
-      case 'x': // Hex escape.
-        if (ThisTokBuf == ThisTokEnd ||
-            (ResultChar = HexDigitValue(*ThisTokBuf)) == ~0U) {
-          Diag(StringToks[i].getLocation(), diag::err_hex_escape_no_digits);
-          ResultChar = 0;
-          break;
-        }
-        ++ThisTokBuf; // Consumed one hex digit.
-        
-        assert(0 && "hex escape: unimp!");
-        break;
-      case '0': case '1': case '2': case '3':
-      case '4': case '5': case '6': case '7':
-        // Octal escapes.
-        assert(0 && "octal escape: unimp!");
-        break;
-        
-      // Otherwise, these are not valid escapes.
-      case '(': case '{': case '[': case '%':
-        // GCC accepts these as extensions.  We warn about them as such though.
-        if (!PP.getLangOptions().NoExtensions) {
-          Diag(StringToks[i].getLocation(), diag::ext_nonstandard_escape,
-               std::string()+(char)ResultChar);
-          break;
-        }
-        // FALL THROUGH.
-      default:
-        if (isgraph(ThisTokBuf[0])) {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               std::string()+(char)ResultChar);
-        } else {
-          Diag(StringToks[i].getLocation(), diag::ext_unknown_escape,
-               "x"+utohexstr(ResultChar));
-        }
-      }
-
-      // Note: our internal rep of wide char tokens is always little-endian.
-      *ResultPtr++ = ResultChar & 0xFF;
-      
-      if (AnyWide) {
-        for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-          *ResultPtr++ = ResultChar >> i*8;
-      }
-    }
-  }
-  
-  // Add zero terminator.
-  *ResultPtr = 0;
-  if (AnyWide) {
-    for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
-      *ResultPtr++ = 0;
-  }
-  
+  StringLiteralParser Literal(StringToks, NumStringToks, PP, Context.Target);
+  if (Literal.hadError)
+    return ExprResult(true);
+
   SmallVector<SourceLocation, 4> StringTokLocs;
   for (unsigned i = 0; i != NumStringToks; ++i)
     StringTokLocs.push_back(StringToks[i].getLocation());
-  
+    
   // FIXME: use factory.
-  
   // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
-  return new StringLiteral(&ResultBuf[0], ResultPtr-&ResultBuf[0], AnyWide);
+  return new StringLiteral(Literal.GetString(), Literal.GetStringLength(), 
+                           Literal.AnyWide);
 }
 
 

Modified: cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h?rev=39354&r1=39353&r2=39354&view=diff

==============================================================================
--- cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h (original)
+++ cfe/cfe/trunk/include/clang/Lex/LiteralSupport.h Wed Jul 11 11:43:31 2007
@@ -15,12 +15,14 @@
 #define LLVM_CLANG_LITERALSUPPORT_H
 
 #include <string>
+#include "llvm/ADT/SmallString.h"
 
 namespace llvm {
 namespace clang {
 
 class Diagnostic;
 class Preprocessor;
+class LexerToken;
 class SourceLocation;
 class TargetInfo;
     
@@ -91,6 +93,37 @@
     return ptr;
   }
 };
+
+class StringLiteralParser {
+  Preprocessor &PP;
+  TargetInfo &Target;
+  
+  unsigned MaxTokenLength;
+  unsigned SizeBound;
+  unsigned wchar_tByteWidth;
+  SmallString<512> ResultBuf;
+  char *ResultPtr; // cursor
+public:
+  StringLiteralParser(const LexerToken *StringToks, unsigned NumStringToks,
+                      Preprocessor &PP, TargetInfo &T);
+  bool hadError;
+  bool AnyWide;
+  
+  const char *GetString() { return &ResultBuf[0]; }
+  unsigned GetStringLength() { return ResultPtr-&ResultBuf[0]; }
+private:
+  void Diag(SourceLocation Loc, unsigned DiagID, 
+            const std::string &M = std::string());
+
+  /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
+  /// not valid.
+  static int HexDigitValue(char C) {
+    if (C >= '0' && C <= '9') return C-'0';
+    if (C >= 'a' && C <= 'f') return C-'a'+10;
+    if (C >= 'A' && C <= 'F') return C-'A'+10;
+    return -1;
+  }
+};
   
 }  // end namespace clang
 }  // end namespace llvm





More information about the cfe-commits mailing list