[cfe-commits] r64418 - in /cfe/trunk: Driver/PrintPreprocessedOutput.cpp clang.xcodeproj/project.pbxproj include/clang/Lex/TokenConcatenation.h lib/Lex/TokenConcatenation.cpp

Thu Feb 12 16:46:04 PST 2009

Author: lattner
Date: Thu Feb 12 18:46:04 2009
New Revision: 64418

URL: http://llvm.org/viewvc/llvm-project?rev=64418&view=rev
Log:
factor token concatenation avoidance logic out of 
PrintPreprocessedOutput into its own file.  No functionality change.

Added:
    cfe/trunk/include/clang/Lex/TokenConcatenation.h
    cfe/trunk/lib/Lex/TokenConcatenation.cpp
Modified:
    cfe/trunk/Driver/PrintPreprocessedOutput.cpp
    cfe/trunk/clang.xcodeproj/project.pbxproj

Modified: cfe/trunk/Driver/PrintPreprocessedOutput.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Driver/PrintPreprocessedOutput.cpp?rev=64418&r1=64417&r2=64418&view=diff

==============================================================================

--- cfe/trunk/Driver/PrintPreprocessedOutput.cpp (original)
+++ cfe/trunk/Driver/PrintPreprocessedOutput.cpp Thu Feb 12 18:46:04 2009
@@ -12,11 +12,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "clang/Basic/SourceManager.h"
 #include "clang.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/Pragma.h"
+#include "clang/Lex/TokenConcatenation.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Diagnostic.h"
 #include "llvm/ADT/SmallString.h"
@@ -48,6 +50,7 @@
 namespace {
 class PrintPPOutputPPCallbacks : public PPCallbacks {
   Preprocessor &PP;
+  TokenConcatenation ConcatInfo;
 public:
   llvm::raw_ostream &OS;
 private:
@@ -58,7 +61,7 @@
   bool Initialized;
 public:
   PrintPPOutputPPCallbacks(Preprocessor &pp, llvm::raw_ostream &os)
-     : PP(pp), OS(os) {
+     : PP(pp), ConcatInfo(PP), OS(os) {
     CurLine = 0;
     CurFilename += "<uninit>";
     EmittedTokensOnThisLine = false;
@@ -78,7 +81,9 @@
 
   bool HandleFirstTokOnLine(Token &Tok);
   bool MoveToLine(SourceLocation Loc);
-  bool AvoidConcat(const Token &PrevTok, const Token &Tok);
+  bool AvoidConcat(const Token &PrevTok, const Token &Tok) {
+    return ConcatInfo.AvoidConcat(PrevTok, Tok);
+  }
   void WriteLineInfo(unsigned LineNo, const char *Extra=0, unsigned ExtraLen=0);
 };
 }  // end anonymous namespace
@@ -291,222 +296,6 @@
 } // end anonymous namespace
 
 
-enum AvoidConcatInfo {
-  /// By default, a token never needs to avoid concatenation.  Most tokens (e.g.
-  /// ',', ')', etc) don't cause a problem when concatenated.
-  aci_never_avoid_concat = 0,
-
-  /// aci_custom_firstchar - AvoidConcat contains custom code to handle this
-  /// token's requirements, and it needs to know the first character of the
-  /// token.
-  aci_custom_firstchar = 1,
-
-  /// aci_custom - AvoidConcat contains custom code to handle this token's
-  /// requirements, but it doesn't need to know the first character of the
-  /// token.
-  aci_custom = 2,
-  
-  /// aci_avoid_equal - Many tokens cannot be safely followed by an '='
-  /// character.  For example, "<<" turns into "<<=" when followed by an =.
-  aci_avoid_equal = 4
-};
-
-/// This array contains information for each token on what action to take when
-/// avoiding concatenation of tokens in the AvoidConcat method.
-static char TokenInfo[tok::NUM_TOKENS];
-
-/// InitAvoidConcatTokenInfo - Tokens that must avoid concatenation should be
-/// marked by this function.
-static void InitAvoidConcatTokenInfo() {
-  // These tokens have custom code in AvoidConcat.
-  TokenInfo[tok::identifier      ] |= aci_custom;
-  TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
-  TokenInfo[tok::period          ] |= aci_custom_firstchar;
-  TokenInfo[tok::amp             ] |= aci_custom_firstchar;
-  TokenInfo[tok::plus            ] |= aci_custom_firstchar;
-  TokenInfo[tok::minus           ] |= aci_custom_firstchar;
-  TokenInfo[tok::slash           ] |= aci_custom_firstchar;
-  TokenInfo[tok::less            ] |= aci_custom_firstchar;
-  TokenInfo[tok::greater         ] |= aci_custom_firstchar;
-  TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
-  TokenInfo[tok::percent         ] |= aci_custom_firstchar;
-  TokenInfo[tok::colon           ] |= aci_custom_firstchar;
-  TokenInfo[tok::hash            ] |= aci_custom_firstchar;
-  TokenInfo[tok::arrow           ] |= aci_custom_firstchar;
-  
-  // These tokens change behavior if followed by an '='.
-  TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
-  TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
-  TokenInfo[tok::minus       ] |= aci_avoid_equal;           // -=
-  TokenInfo[tok::slash       ] |= aci_avoid_equal;           // /=
-  TokenInfo[tok::less        ] |= aci_avoid_equal;           // <=
-  TokenInfo[tok::greater     ] |= aci_avoid_equal;           // >=
-  TokenInfo[tok::pipe        ] |= aci_avoid_equal;           // |=
-  TokenInfo[tok::percent     ] |= aci_avoid_equal;           // %=
-  TokenInfo[tok::star        ] |= aci_avoid_equal;           // *=
-  TokenInfo[tok::exclaim     ] |= aci_avoid_equal;           // !=
-  TokenInfo[tok::lessless    ] |= aci_avoid_equal;           // <<=
-  TokenInfo[tok::greaterequal] |= aci_avoid_equal;           // >>=
-  TokenInfo[tok::caret       ] |= aci_avoid_equal;           // ^=
-  TokenInfo[tok::equal       ] |= aci_avoid_equal;           // ==
-}
-
-/// StartsWithL - Return true if the spelling of this token starts with 'L'.
-static bool StartsWithL(const Token &Tok, Preprocessor &PP) {
-  if (!Tok.needsCleaning()) {
-    SourceManager &SrcMgr = PP.getSourceManager();
-    return *SrcMgr.getCharacterData(SrcMgr.getSpellingLoc(Tok.getLocation()))
-               == 'L';
-  }
-  
-  if (Tok.getLength() < 256) {
-    char Buffer[256];
-    const char *TokPtr = Buffer;
-    PP.getSpelling(Tok, TokPtr);
-    return TokPtr[0] == 'L';
-  }
-
-  return PP.getSpelling(Tok)[0] == 'L';
-}
-
-/// IsIdentifierL - Return true if the spelling of this token is literally 'L'.
-static bool IsIdentifierL(const Token &Tok, Preprocessor &PP) {
-  if (!Tok.needsCleaning()) {
-    if (Tok.getLength() != 1)
-      return false;
-    SourceManager &SrcMgr = PP.getSourceManager();
-    return *SrcMgr.getCharacterData(SrcMgr.getSpellingLoc(Tok.getLocation()))
-               == 'L';
-  }
-  
-  if (Tok.getLength() < 256) {
-    char Buffer[256];
-    const char *TokPtr = Buffer;
-    if (PP.getSpelling(Tok, TokPtr) != 1) 
-      return false;
-    return TokPtr[0] == 'L';
-  }
-  
-  return PP.getSpelling(Tok) == "L";
-}
-
-
-/// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
-/// the two individual tokens to be lexed as a single token, return true (which
-/// causes a space to be printed between them).  This allows the output of -E
-/// mode to be lexed to the same token stream as lexing the input directly
-/// would.
-///
-/// This code must conservatively return true if it doesn't want to be 100%
-/// accurate.  This will cause the output to include extra space characters, but
-/// the resulting output won't have incorrect concatenations going on.  Examples
-/// include "..", which we print with a space between, because we don't want to
-/// track enough to tell "x.." from "...".
-bool PrintPPOutputPPCallbacks::AvoidConcat(const Token &PrevTok,
-                                           const Token &Tok) {
-  char Buffer[256];
-  
-  tok::TokenKind PrevKind = PrevTok.getKind();
-  if (PrevTok.getIdentifierInfo())  // Language keyword or named operator.
-    PrevKind = tok::identifier;
- 
-  // Look up information on when we should avoid concatenation with prevtok.
-  unsigned ConcatInfo = TokenInfo[PrevKind];
-  
-  // If prevtok never causes a problem for anything after it, return quickly.
-  if (ConcatInfo == 0) return false;
-
-  if (ConcatInfo & aci_avoid_equal) {
-    // If the next token is '=' or '==', avoid concatenation.
-    if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
-      return true;
-    ConcatInfo &= ~aci_avoid_equal;
-  }
-  
-  if (ConcatInfo == 0) return false;
-
-  
-  
-  // Basic algorithm: we look at the first character of the second token, and
-  // determine whether it, if appended to the first token, would form (or would
-  // contribute) to a larger token if concatenated.
-  char FirstChar = 0;
-  if (ConcatInfo & aci_custom) {
-    // If the token does not need to know the first character, don't get it.
-  } else if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
-    // Avoid spelling identifiers, the most common form of token.
-    FirstChar = II->getName()[0];
-  } else if (!Tok.needsCleaning()) {
-    if (Tok.isLiteral() && Tok.getLiteralData()) {
-      FirstChar = *Tok.getLiteralData();
-    } else {
-      SourceManager &SrcMgr = PP.getSourceManager();
-      FirstChar =
-        *SrcMgr.getCharacterData(SrcMgr.getSpellingLoc(Tok.getLocation()));
-    }
-  } else if (Tok.getLength() < 256) {
-    const char *TokPtr = Buffer;
-    PP.getSpelling(Tok, TokPtr);
-    FirstChar = TokPtr[0];
-  } else {
-    FirstChar = PP.getSpelling(Tok)[0];
-  }
- 
-  switch (PrevKind) {
-  default: assert(0 && "InitAvoidConcatTokenInfo built wrong");
-  case tok::identifier:   // id+id or id+number or id+L"foo".
-    if (Tok.is(tok::numeric_constant) || Tok.getIdentifierInfo() ||
-        Tok.is(tok::wide_string_literal) /* ||
-        Tok.is(tok::wide_char_literal)*/)
-      return true;
-
-    // If this isn't identifier + string, we're done.
-    if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
-      return false;
-      
-    // FIXME: need a wide_char_constant!
-
-    // If the string was a wide string L"foo" or wide char L'f', it would concat
-    // with the previous identifier into fooL"bar".  Avoid this.
-    if (StartsWithL(Tok, PP))
-      return true;
-
-    // Otherwise, this is a narrow character or string.  If the *identifier* is
-    // a literal 'L', avoid pasting L "foo" -> L"foo".
-    return IsIdentifierL(PrevTok, PP);
-  case tok::numeric_constant:
-    return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
-           FirstChar == '+' || FirstChar == '-' || FirstChar == '.';
-  case tok::period:          // ..., .*, .1234
-    return FirstChar == '.' || isdigit(FirstChar) ||
-           (FirstChar == '*' && PP.getLangOptions().CPlusPlus);
-  case tok::amp:             // &&
-    return FirstChar == '&';
-  case tok::plus:            // ++
-    return FirstChar == '+';
-  case tok::minus:           // --, ->, ->*
-    return FirstChar == '-' || FirstChar == '>';
-  case tok::slash:           //, /*, //
-    return FirstChar == '*' || FirstChar == '/';
-  case tok::less:            // <<, <<=, <:, <%
-    return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
-  case tok::greater:         // >>, >>=
-    return FirstChar == '>';
-  case tok::pipe:            // ||
-    return FirstChar == '|';
-  case tok::percent:         // %>, %:
-    return (FirstChar == '>' || FirstChar == ':') &&
-           PP.getLangOptions().Digraphs;
-  case tok::colon:           // ::, :>
-    return (FirstChar == ':' && PP.getLangOptions().CPlusPlus) ||
-           (FirstChar == '>' && PP.getLangOptions().Digraphs);
-  case tok::hash:            // ##, #@, %:%:
-    return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
-  case tok::arrow:           // ->*
-    return FirstChar == '*';
-  }
-}
-
 static void PrintPreprocessedTokens(Preprocessor &PP, Token &Tok,
                                     PrintPPOutputPPCallbacks *Callbacks,
                                     llvm::raw_ostream &OS) {
@@ -614,8 +403,6 @@
   // Inform the preprocessor whether we want it to retain comments or not, due
   // to -C or -CC.
   PP.SetCommentRetentionState(EnableCommentOutput, EnableMacroCommentOutput);
-  InitAvoidConcatTokenInfo();
-
   
   // Open the output buffer.
   std::string Err;
@@ -646,8 +433,8 @@
       PrintMacroDefinition(*MacrosByID[i].first, *MacrosByID[i].second, PP, OS);
     
   } else {
-    PrintPPOutputPPCallbacks *Callbacks;
-    Callbacks = new PrintPPOutputPPCallbacks(PP, OS);
+    PrintPPOutputPPCallbacks *Callbacks
+      = new PrintPPOutputPPCallbacks(PP, OS);
     PP.AddPragmaHandler(0, new UnknownPragmaHandler("#pragma", Callbacks));
     PP.AddPragmaHandler("GCC", new UnknownPragmaHandler("#pragma GCC",
                                                         Callbacks));

Modified: cfe/trunk/clang.xcodeproj/project.pbxproj
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/clang.xcodeproj/project.pbxproj?rev=64418&r1=64417&r2=64418&view=diff

==============================================================================
--- cfe/trunk/clang.xcodeproj/project.pbxproj (original)
+++ cfe/trunk/clang.xcodeproj/project.pbxproj Thu Feb 12 18:46:04 2009
@@ -177,6 +177,7 @@
 		DEAEE98B0A5A2B970045101B /* MultipleIncludeOpt.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = DEAEE98A0A5A2B970045101B /* MultipleIncludeOpt.h */; };
 		DEAEED4B0A5AF89A0045101B /* NOTES.txt in CopyFiles */ = {isa = PBXBuildFile; fileRef = DEAEED4A0A5AF89A0045101B /* NOTES.txt */; };
 		DEB076CF0F3A222200F5A2BE /* DeclTemplate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DEB076CE0F3A222200F5A2BE /* DeclTemplate.cpp */; };
+		DEB077990F44F97800F5A2BE /* TokenConcatenation.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DEB077980F44F97800F5A2BE /* TokenConcatenation.cpp */; };
 		DEC63B1A0C7B940200DBF169 /* CFG.cpp in Sources */ = {isa = PBXBuildFile; fileRef = DEC63B190C7B940200DBF169 /* CFG.cpp */; };
 		DEC63B1C0C7B940600DBF169 /* CFG.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = DEC63B1B0C7B940600DBF169 /* CFG.h */; };
 		DEC8D9910A9433CD00353FCA /* Decl.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = DEC8D9900A9433CD00353FCA /* Decl.h */; };
@@ -551,6 +552,8 @@
 		DEAEED4A0A5AF89A0045101B /* NOTES.txt */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = text; path = NOTES.txt; sourceTree = "<group>"; };
 		DEB076C90F3A221200F5A2BE /* DeclTemplate.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = DeclTemplate.h; path = clang/AST/DeclTemplate.h; sourceTree = "<group>"; };
 		DEB076CE0F3A222200F5A2BE /* DeclTemplate.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = DeclTemplate.cpp; path = lib/AST/DeclTemplate.cpp; sourceTree = "<group>"; };
+		DEB077930F44F96000F5A2BE /* TokenConcatenation.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TokenConcatenation.h; sourceTree = "<group>"; };
+		DEB077980F44F97800F5A2BE /* TokenConcatenation.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TokenConcatenation.cpp; sourceTree = "<group>"; };
 		DEB089EE0F12F1D900522C07 /* TypeTraits.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = TypeTraits.h; sourceTree = "<group>"; };
 		DEC63B190C7B940200DBF169 /* CFG.cpp */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 2; lastKnownFileType = sourcecode.cpp.cpp; name = CFG.cpp; path = lib/AST/CFG.cpp; sourceTree = "<group>"; tabWidth = 2; };
 		DEC63B1B0C7B940600DBF169 /* CFG.h */ = {isa = PBXFileReference; fileEncoding = 30; indentWidth = 2; lastKnownFileType = sourcecode.c.h; name = CFG.h; path = clang/AST/CFG.h; sourceTree = "<group>"; tabWidth = 2; };
@@ -1119,6 +1122,7 @@
 				35B820740ECB811A0020BEC0 /* PreprocessorLexer.h */,
 				DED7D9170A52518C003AD0FB /* ScratchBuffer.h */,
 				DE6954630C5121BD00A5826B /* Token.h */,
+				DEB077930F44F96000F5A2BE /* TokenConcatenation.h */,
 				DE85CD840D8380F20070E26E /* TokenLexer.h */,
 			);
 			name = Lex;
@@ -1162,6 +1166,7 @@
 				3537AA0D0ECD08A4008F7CDC /* PreprocessorLexer.cpp */,
 				35E1946C0ECB83C100F21733 /* PTHLexer.cpp */,
 				DED7D9E40A5257F6003AD0FB /* ScratchBuffer.cpp */,
+				DEB077980F44F97800F5A2BE /* TokenConcatenation.cpp */,
 				DE85CD800D8380B10070E26E /* TokenLexer.cpp */,
 			);
 			name = Lex;
@@ -1395,6 +1400,7 @@
 				357EA27D0F2526F300439B60 /* SemaLookup.cpp in Sources */,
 				DEB076CF0F3A222200F5A2BE /* DeclTemplate.cpp in Sources */,
 				1A471AB50F437BC500753CE8 /* CGBlocks.cpp in Sources */,
+				DEB077990F44F97800F5A2BE /* TokenConcatenation.cpp in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};

Added: cfe/trunk/include/clang/Lex/TokenConcatenation.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/TokenConcatenation.h?rev=64418&view=auto

==============================================================================
--- cfe/trunk/include/clang/Lex/TokenConcatenation.h (added)
+++ cfe/trunk/include/clang/Lex/TokenConcatenation.h Thu Feb 12 18:46:04 2009
@@ -0,0 +1,73 @@
+//===--- TokenConcatenation.h - Token Concatenation Avoidance ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the TokenConcatenation class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_LEX_TOKEN_CONCATENATION_H
+#define CLANG_LEX_TOKEN_CONCATENATION_H
+
+#include "clang/Basic/TokenKinds.h"
+
+namespace clang {
+  class Preprocessor;
+  class Token;
+  
+  /// TokenConcatenation class, which answers the question of
+  ///   "Is it safe to emit two tokens without a whitespace between them, or
+  ///    would that cause implicit concatenation of the tokens?"
+  ///
+  /// For example, emitting two identifiers "foo" and "bar" next to each
+  /// other would cause the lexer to produce one "foobar" token.  Emitting "1"
+  /// and ")" next to each other is safe.
+  ///
+  class TokenConcatenation {
+    Preprocessor &PP;  // Supplies spellings, SourceManager and LangOptions.
+    /// Bit flags stored per token kind in the TokenInfo table below.
+    enum AvoidConcatInfo {
+      /// By default, a token never needs to avoid concatenation.  Most tokens
+      /// (e.g. ',', ')', etc) don't cause a problem when concatenated.
+      aci_never_avoid_concat = 0,
+      
+      /// aci_custom_firstchar - AvoidConcat contains custom code to handle this
+      /// token's requirements, and it needs to know the first character of the
+      /// token that follows it.
+      aci_custom_firstchar = 1,
+      
+      /// aci_custom - AvoidConcat contains custom code to handle this token's
+      /// requirements, but it doesn't need to know the first character of the
+      /// token that follows it.
+      aci_custom = 2,
+      
+      /// aci_avoid_equal - Many tokens cannot be safely followed by an '='
+      /// character.  For example, "<<" turns into "<<=" when followed by an =.
+      aci_avoid_equal = 4
+    };
+    
+    /// TokenInfo - For each token kind, a bitmask of AvoidConcatInfo flags
+    /// telling the AvoidConcat method what to check when a token of that kind
+    /// is the first token of a pair.  Indexed by tok::TokenKind.
+    char TokenInfo[tok::NUM_TOKENS];
+  public:
+    TokenConcatenation(Preprocessor &PP);  // Populates the TokenInfo table.
+    /// AvoidConcat - Return true if Tok must not be printed right after PrevTok.
+    bool AvoidConcat(const Token &PrevTok, const Token &Tok) const;
+
+  private:
+    /// StartsWithL - Return true if the spelling of this token starts with 'L'.
+    bool StartsWithL(const Token &Tok) const;
+    
+    /// IsIdentifierL - Return true if the spelling of this token is exactly
+    /// the single character 'L'.
+    bool IsIdentifierL(const Token &Tok) const;
+  };
+  } // end clang namespace
+
+#endif

Added: cfe/trunk/lib/Lex/TokenConcatenation.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/TokenConcatenation.cpp?rev=64418&view=auto

==============================================================================
--- cfe/trunk/lib/Lex/TokenConcatenation.cpp (added)
+++ cfe/trunk/lib/Lex/TokenConcatenation.cpp Thu Feb 12 18:46:04 2009
@@ -0,0 +1,205 @@
+//===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TokenConcatenation class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TokenConcatenation.h"
+#include "clang/Lex/Preprocessor.h"
+using namespace clang; 
+
+
+/// StartsWithL - Return true if the spelling of this token starts with 'L'.
+bool TokenConcatenation::StartsWithL(const Token &Tok) const {
+  if (!Tok.needsCleaning()) {  // Read the character at the spelling location.
+    SourceManager &SM = PP.getSourceManager();
+    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
+  }
+  // Token needs cleaning: use the spelling computed by the preprocessor.
+  if (Tok.getLength() < 256) {
+    char Buffer[256];
+    const char *TokPtr = Buffer;
+    PP.getSpelling(Tok, TokPtr);
+    return TokPtr[0] == 'L';
+  }
+  // Too long for the stack buffer: use the overload returning the spelling.
+  return PP.getSpelling(Tok)[0] == 'L';
+}
+
+/// IsIdentifierL - Return true if the spelling of this token is exactly the
+/// single character 'L'.
+bool TokenConcatenation::IsIdentifierL(const Token &Tok) const {
+  if (!Tok.needsCleaning()) {
+    if (Tok.getLength() != 1)
+      return false;
+    SourceManager &SM = PP.getSourceManager();
+    return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())) == 'L';
+  }
+  // Token needs cleaning: use the spelling computed by the preprocessor.
+  if (Tok.getLength() < 256) {
+    char Buffer[256];
+    const char *TokPtr = Buffer;
+    if (PP.getSpelling(Tok, TokPtr) != 1) 
+      return false;
+    return TokPtr[0] == 'L';
+  }
+  // Too long for the stack buffer: compare the whole returned spelling.
+  return PP.getSpelling(Tok) == "L";
+}
+
+TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
+  // Build the per-token-kind table read by AvoidConcat; default is "never".
+  memset(TokenInfo, 0, sizeof(TokenInfo));
+  // These tokens have custom code in AvoidConcat.
+  TokenInfo[tok::identifier      ] |= aci_custom;
+  TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
+  TokenInfo[tok::period          ] |= aci_custom_firstchar;
+  TokenInfo[tok::amp             ] |= aci_custom_firstchar;
+  TokenInfo[tok::plus            ] |= aci_custom_firstchar;
+  TokenInfo[tok::minus           ] |= aci_custom_firstchar;
+  TokenInfo[tok::slash           ] |= aci_custom_firstchar;
+  TokenInfo[tok::less            ] |= aci_custom_firstchar;
+  TokenInfo[tok::greater         ] |= aci_custom_firstchar;
+  TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
+  TokenInfo[tok::percent         ] |= aci_custom_firstchar;
+  TokenInfo[tok::colon           ] |= aci_custom_firstchar;
+  TokenInfo[tok::hash            ] |= aci_custom_firstchar;
+  TokenInfo[tok::arrow           ] |= aci_custom_firstchar;
+  
+  // These tokens form a different, longer token if followed by an '='.
+  TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
+  TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
+  TokenInfo[tok::minus       ] |= aci_avoid_equal;           // -=
+  TokenInfo[tok::slash       ] |= aci_avoid_equal;           // /=
+  TokenInfo[tok::less        ] |= aci_avoid_equal;           // <=
+  TokenInfo[tok::greater     ] |= aci_avoid_equal;           // >=
+  TokenInfo[tok::pipe        ] |= aci_avoid_equal;           // |=
+  TokenInfo[tok::percent     ] |= aci_avoid_equal;           // %=
+  TokenInfo[tok::star        ] |= aci_avoid_equal;           // *=
+  TokenInfo[tok::exclaim     ] |= aci_avoid_equal;           // !=
+  TokenInfo[tok::lessless    ] |= aci_avoid_equal;           // <<=
+  TokenInfo[tok::greatergreater] |= aci_avoid_equal;         // >>=
+  TokenInfo[tok::caret       ] |= aci_avoid_equal;           // ^=
+  TokenInfo[tok::equal       ] |= aci_avoid_equal;           // ==
+}
+
+/// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
+/// the two individual tokens to be lexed as a single token, return true
+/// (which causes a space to be printed between them).  This allows the output
+/// of -E mode to be lexed to the same token stream as lexing the input
+/// directly would.
+///
+/// This code must conservatively return true if it doesn't want to be 100%
+/// accurate.  This will cause the output to include extra space characters,
+/// but the resulting output won't have incorrect concatenations going on.
+/// Examples include "..", which we print with a space between, because we
+/// don't want to track enough to tell "x.." from "...".
+bool TokenConcatenation::AvoidConcat(const Token &PrevTok,
+                                     const Token &Tok) const {
+  char Buffer[256];  // Scratch space for spelling short tokens below.
+  
+  tok::TokenKind PrevKind = PrevTok.getKind();
+  if (PrevTok.getIdentifierInfo())  // Language keyword or named operator.
+    PrevKind = tok::identifier;
+  
+  // Look up information on when we should avoid concatenation with prevtok.
+  unsigned ConcatInfo = TokenInfo[PrevKind];
+  
+  // If prevtok never causes a problem for anything after it, return quickly.
+  if (ConcatInfo == 0) return false;
+  
+  if (ConcatInfo & aci_avoid_equal) {
+    // If the next token is '=' or '==', avoid concatenation.
+    if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
+      return true;
+    ConcatInfo &= ~aci_avoid_equal;
+  }
+  // Kinds that only needed the '=' check (e.g. '*', '!') are done now.
+  if (ConcatInfo == 0) return false;
+  
+  // Basic algorithm: we look at the first character of the second token, and
+  // determine whether it, if appended to the first token, would form (or
+  // would contribute) to a larger token if concatenated.
+  char FirstChar = 0;
+  if (ConcatInfo & aci_custom) {
+    // If the token does not need to know the first character, don't get it.
+  } else if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
+    // Avoid spelling identifiers, the most common form of token.
+    FirstChar = II->getName()[0];
+  } else if (!Tok.needsCleaning()) {
+    if (Tok.isLiteral() && Tok.getLiteralData()) {
+      FirstChar = *Tok.getLiteralData();
+    } else {
+      SourceManager &SrcMgr = PP.getSourceManager();
+      FirstChar =
+      *SrcMgr.getCharacterData(SrcMgr.getSpellingLoc(Tok.getLocation()));
+    }
+  } else if (Tok.getLength() < 256) {
+    const char *TokPtr = Buffer;
+    PP.getSpelling(Tok, TokPtr);
+    FirstChar = TokPtr[0];
+  } else {
+    FirstChar = PP.getSpelling(Tok)[0];
+  }
+  // Only kinds given a custom flag by the constructor can reach this switch.
+  switch (PrevKind) {
+  default: assert(0 && "InitAvoidConcatTokenInfo built wrong");
+  case tok::identifier:   // id+id or id+number or id+L"foo".
+    if (Tok.is(tok::numeric_constant) || Tok.getIdentifierInfo() ||
+        Tok.is(tok::wide_string_literal) /* ||
+     Tok.is(tok::wide_char_literal)*/)
+      return true;
+    
+    // If this isn't identifier + string, we're done.
+    if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
+      return false;
+    
+    // FIXME: need a wide_char_constant!
+    
+    // If the string was a wide string L"foo" or wide char L'f', it would
+    // concat with the previous identifier into fooL"bar".  Avoid this.
+    if (StartsWithL(Tok))
+      return true;
+    
+    // Otherwise, this is a narrow character or string.  If the *identifier*
+    // is a literal 'L', avoid pasting L "foo" -> L"foo".
+    return IsIdentifierL(PrevTok);
+  case tok::numeric_constant:
+    return isalnum(FirstChar) || Tok.is(tok::numeric_constant) ||
+    FirstChar == '+' || FirstChar == '-' || FirstChar == '.';
+  case tok::period:          // ..., .*, .1234
+    return FirstChar == '.' || isdigit(FirstChar) ||
+    (FirstChar == '*' && PP.getLangOptions().CPlusPlus);
+  case tok::amp:             // &&
+    return FirstChar == '&';
+  case tok::plus:            // ++
+    return FirstChar == '+';
+  case tok::minus:           // --, ->, ->*
+    return FirstChar == '-' || FirstChar == '>';
+  case tok::slash:           // /*, //
+    return FirstChar == '*' || FirstChar == '/';
+  case tok::less:            // <<, <<=, <:, <%
+    return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
+  case tok::greater:         // >>, >>=
+    return FirstChar == '>';
+  case tok::pipe:            // ||
+    return FirstChar == '|';
+  case tok::percent:         // %>, %:
+    return (FirstChar == '>' || FirstChar == ':') &&
+    PP.getLangOptions().Digraphs;
+  case tok::colon:           // ::, :>
+    return (FirstChar == ':' && PP.getLangOptions().CPlusPlus) ||
+    (FirstChar == '>' && PP.getLangOptions().Digraphs);
+  case tok::hash:            // ##, #@, %:%:
+    return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
+  case tok::arrow:           // ->*
+    return FirstChar == '*';
+  }
+}