[cfe-commits] r64326 - in /cfe/trunk: Driver/CacheTokens.cpp include/clang/Basic/IdentifierTable.h include/clang/Lex/PTHManager.h lib/Lex/PTHLexer.cpp

Ted Kremenek kremenek at apple.com
Wed Feb 11 13:29:17 PST 2009


Author: kremenek
Date: Wed Feb 11 15:29:16 2009
New Revision: 64326

URL: http://llvm.org/viewvc/llvm-project?rev=64326&view=rev
Log:
PTH: Replace string identifier to persistent ID lookup with a hashtable. This is
actually *slightly* slower than the binary search. Since this is algorithmically
better, further performance tuning should be able to make this faster.

Modified:
    cfe/trunk/Driver/CacheTokens.cpp
    cfe/trunk/include/clang/Basic/IdentifierTable.h
    cfe/trunk/include/clang/Lex/PTHManager.h
    cfe/trunk/lib/Lex/PTHLexer.cpp

Modified: cfe/trunk/Driver/CacheTokens.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Driver/CacheTokens.cpp?rev=64326&r1=64325&r2=64326&view=diff

==============================================================================
--- cfe/trunk/Driver/CacheTokens.cpp (original)
+++ cfe/trunk/Driver/CacheTokens.cpp Wed Feb 11 15:29:16 2009
@@ -53,6 +53,16 @@
   for ( ; n ; --n ) Emit8(Out, 0);
 }
 
+// Bernstein hash function:
+// This is basically copy-and-paste from StringMap.  This likely won't
+// stay here, which is why I didn't both to expose this function from
+// String Map.
+static unsigned BernsteinHash(const char* x) {  
+  unsigned int R = 0;
+  for ( ; *x != '\0' ; ++x) R = R * 33 + *x;
+  return R + (R >> 5);
+}
+
 //===----------------------------------------------------------------------===//
 // On Disk Hashtable Logic.  This will eventually get refactored and put
 // elsewhere.
@@ -193,14 +203,7 @@
   typedef const PCHEntry& data_type_ref;
   
   static unsigned ComputeHash(const FileEntry* FE) {
-    // Bernstein hash function:
-    // This is basically copy-and-paste from StringMap.  This likely won't
-    // stay here, which is why I didn't both to expose this function from
-    // String Map.  There are plenty of other hash functions which are likely
-    // to perform better and be faster.
-    unsigned int R = 0;
-    for (const char* x = FE->getName(); *x != '\0' ; ++x) R = R * 33 + *x;
-    return R + (R >> 5);
+    return BernsteinHash(FE->getName());
   }
   
   static std::pair<unsigned,unsigned> 
@@ -273,6 +276,10 @@
     for ( ; I != E ; ++I) Out << *I;
   }
   
+  /// EmitIdentifierTable - Emits two tables to the PTH file.  The first is
+  ///  a hashtable mapping from identifier strings to persistent IDs.
+  ///  The second is a straight table mapping from persistent IDs to string data
+  ///  (the keys of the first table).
   std::pair<Offset, Offset> EmitIdentifierTable();
   
   /// EmitFileTable - Emit a table mapping from file name strings to PTH
@@ -335,83 +342,6 @@
   Emit32(PP.getSourceManager().getFileOffset(T.getLocation()));
 }
 
-namespace {
-struct VISIBILITY_HIDDEN IDData {
-  const IdentifierInfo* II;
-  uint32_t FileOffset;
-};
-  
-class VISIBILITY_HIDDEN CompareIDDataIndex {
-  IDData* Table;
-public:  
-  CompareIDDataIndex(IDData* table) : Table(table) {}
-
-  bool operator()(unsigned i, unsigned j) const {
-    const IdentifierInfo* II_i = Table[i].II;
-    const IdentifierInfo* II_j = Table[j].II;
-
-    unsigned i_len = II_i->getLength();
-    unsigned j_len = II_j->getLength();
-    
-    if (i_len > j_len)
-      return false;
-    
-    if (i_len < j_len)
-      return true;
-
-    // Otherwise, compare the strings themselves!
-    return strncmp(II_i->getName(), II_j->getName(), i_len) < 0;    
-  }
-};
-}
-
-std::pair<Offset,Offset> PTHWriter::EmitIdentifierTable() {  
-  llvm::BumpPtrAllocator Alloc;
-
-  // Build an inverse map from persistent IDs -> IdentifierInfo*.
-  IDData* IIDMap = Alloc.Allocate<IDData>(idcount);
-  
-  // Generate mapping from persistent IDs -> IdentifierInfo*.
-  for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) {
-    // Decrement by 1 because we are using a vector for the lookup and
-    // 0 is reserved for NULL.
-    assert(I->second > 0);
-    assert(I->second-1 < idcount);
-    unsigned idx = I->second-1;
-    IIDMap[idx].II = I->first;
-  }  
-  
-  // We want to write out the strings in lexical order to support binary
-  // search of strings to identifiers.  Create such a table.
-  unsigned *LexicalOrder = Alloc.Allocate<unsigned>(idcount);
-  for (unsigned i = 0; i < idcount ; ++i ) LexicalOrder[i] = i;
-  std::sort(LexicalOrder, LexicalOrder+idcount, CompareIDDataIndex(IIDMap));
-  
-  // Write out the lexically-sorted table of persistent ids.
-  Offset LexicalOff = Out.tell();
-  for (unsigned i = 0; i < idcount ; ++i) Emit32(LexicalOrder[i]);
-  
-  for (unsigned i = 0; i < idcount; ++i) {
-    IDData& d = IIDMap[i];
-    d.FileOffset = Out.tell();            // Record the location for this data.  
-    unsigned len = d.II->getLength(); // Write out the string length.
-    Emit32(len);
-    const char* buf = d.II->getName(); // Write out the string data.
-    EmitBuf(buf, buf+len);
-    // Emit a null character for those clients expecting that IdentifierInfo
-    // strings are null terminated.
-    Emit8('\0');
-  }
-  
-  // Now emit the table mapping from persistent IDs to PTH file offsets.  
-  Offset IDOff = Out.tell();
-  Emit32(idcount);  // Emit the number of identifiers.
-  for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset);
-
-  return std::make_pair(IDOff, LexicalOff);
-}
-
-
 PCHEntry PTHWriter::LexTokens(Lexer& L) {
   // Pad 0's so that we emit tokens to a 4-byte alignment.
   // This speed up reading them back in.
@@ -657,3 +587,92 @@
   PW.GeneratePTH();
 }
 
+//===----------------------------------------------------------------------===//
+
+namespace {
+class VISIBILITY_HIDDEN PCHIdKey {
+public:
+  const IdentifierInfo* II;
+  uint32_t FileOffset;
+};
+
+class VISIBILITY_HIDDEN PCHIdentifierTableTrait {
+public:
+  typedef PCHIdKey* key_type;
+  typedef key_type  key_type_ref;
+  
+  typedef uint32_t  data_type;
+  typedef data_type data_type_ref;
+  
+  static unsigned ComputeHash(PCHIdKey* key) {
+    return BernsteinHash(key->II->getName());
+  }
+  
+  static std::pair<unsigned,unsigned> 
+  EmitKeyDataLength(llvm::raw_ostream& Out, const PCHIdKey* key, uint32_t) {    
+    unsigned n = strlen(key->II->getName()) + 1;
+    ::Emit16(Out, n);
+    return std::make_pair(n, sizeof(uint32_t));
+  }
+  
+  static void EmitKey(llvm::raw_fd_ostream& Out, PCHIdKey* key, unsigned n) {
+    // Record the location of the key data.  This is used when generating
+    // the mapping from persistent IDs to strings.
+    key->FileOffset = Out.tell();
+    Out.write(key->II->getName(), n);
+  }
+  
+  static void EmitData(llvm::raw_ostream& Out, uint32_t pID, unsigned) {
+    ::Emit32(Out, pID);
+  }        
+};
+} // end anonymous namespace
+
+/// EmitIdentifierTable - Emits two tables to the PTH file.  The first is
+///  a hashtable mapping from identifier strings to persistent IDs.  The second
+///  is a straight table mapping from persistent IDs to string data (the
+///  keys of the first table).
+///
+std::pair<Offset,Offset> PTHWriter::EmitIdentifierTable() {
+  // Build two maps:
+  //  (1) an inverse map from persistent IDs -> (IdentifierInfo*,Offset)
+  //  (2) a map from (IdentifierInfo*, Offset)* -> persistent IDs
+
+  // Note that we use 'calloc', so all the bytes are 0.
+  PCHIdKey* IIDMap = (PCHIdKey*) calloc(idcount, sizeof(PCHIdKey));
+
+  // Create the hashtable.
+  OnDiskChainedHashTableGenerator<PCHIdentifierTableTrait> IIOffMap;
+  
+  // Generate mapping from persistent IDs -> IdentifierInfo*.
+  for (IDMap::iterator I=IM.begin(), E=IM.end(); I!=E; ++I) {
+    // Decrement by 1 because we are using a vector for the lookup and
+    // 0 is reserved for NULL.
+    assert(I->second > 0);
+    assert(I->second-1 < idcount);
+    unsigned idx = I->second-1;
+    
+    // Store the mapping from persistent ID to IdentifierInfo*
+    IIDMap[idx].II = I->first;
+    
+    // Store the reverse mapping in a hashtable.
+    IIOffMap.insert(&IIDMap[idx], I->second);
+  }
+  
+  // Write out the inverse map first.  This causes the PCIDKey entries to
+  // record PTH file offsets for the string data.  This is used to write
+  // the second table.
+  Offset StringTableOffset = IIOffMap.Emit(Out);
+  
+  // Now emit the table mapping from persistent IDs to PTH file offsets.  
+  Offset IDOff = Out.tell();
+  Emit32(idcount);  // Emit the number of identifiers.
+  for (unsigned i = 0 ; i < idcount; ++i) Emit32(IIDMap[i].FileOffset);
+  
+  // Finally, release the inverse map.
+  free(IIDMap);
+  
+  return std::make_pair(IDOff, StringTableOffset);
+}
+
+

Modified: cfe/trunk/include/clang/Basic/IdentifierTable.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/IdentifierTable.h?rev=64326&r1=64325&r2=64326&view=diff

==============================================================================
--- cfe/trunk/include/clang/Basic/IdentifierTable.h (original)
+++ cfe/trunk/include/clang/Basic/IdentifierTable.h Wed Feb 11 15:29:16 2009
@@ -83,24 +83,26 @@
   ///
   const char *getName() const {    
     if (Entry) return Entry->getKeyData();
+    // FIXME: This is gross. It would be best not to embed specific details
+    // of the PTH file format here.
     // The 'this' pointer really points to a 
     // std::pair<IdentifierInfo, const char*>, where internal pointer
     // points to the external string data.
-    return ((std::pair<IdentifierInfo, const char*>*) this)->second + 4;
+    return ((std::pair<IdentifierInfo, const char*>*) this)->second;
   }
   
   /// getLength - Efficiently return the length of this identifier info.
   ///
   unsigned getLength() const {
     if (Entry) return Entry->getKeyLength();
+    // FIXME: This is gross. It would be best not to embed specific details
+    // of the PTH file format here.
     // The 'this' pointer really points to a 
     // std::pair<IdentifierInfo, const char*>, where internal pointer
     // points to the external string data.
-    const char* p = ((std::pair<IdentifierInfo, const char*>*) this)->second;
-    return ((unsigned) p[0])
-      | (((unsigned) p[1]) << 8)
-      | (((unsigned) p[2]) << 16)
-      | (((unsigned) p[3]) << 24);   
+    const char* p = ((std::pair<IdentifierInfo, const char*>*) this)->second-2;
+    return (((unsigned) p[0])
+        | (((unsigned) p[1]) << 8)) - 1;
   }
   
   /// hasMacroDefinition - Return true if this identifier is #defined to some

Modified: cfe/trunk/include/clang/Lex/PTHManager.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Lex/PTHManager.h?rev=64326&r1=64325&r2=64326&view=diff

==============================================================================
--- cfe/trunk/include/clang/Lex/PTHManager.h (original)
+++ cfe/trunk/include/clang/Lex/PTHManager.h Wed Feb 11 15:29:16 2009
@@ -53,9 +53,9 @@
   ///  reconsitute an IdentifierInfo.
   const unsigned char* const IdDataTable;
   
-  /// SortedIdTable - Array ordering persistent identifier IDs by the lexical
-  ///  order of their corresponding strings.  This is used by get().
-  const unsigned char* const SortedIdTable;
+  /// SortedIdTable - Abstract data structure mapping from strings to
+  ///  persistent IDs.  This is used by get().
+  void* StringIdLookup;
 
   /// NumIds - The number of identifiers in the PTH file.
   const unsigned NumIds;
@@ -72,7 +72,7 @@
   /// method.
   PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup,
              const unsigned char* idDataTable, IdentifierInfo** perIDCache,
-             const unsigned char* sortedIdTable, unsigned numIds,
+             void* stringIdLookup, unsigned numIds,
              const unsigned char* spellingBase);
 
   // Do not implement.
@@ -83,7 +83,6 @@
   ///  spelling for a token.
   unsigned getSpellingAtPTHOffset(unsigned PTHOffset, const char*& Buffer);
   
-  
   /// GetIdentifierInfo - Used to reconstruct IdentifierInfo objects from the
   ///  PTH file.
   inline IdentifierInfo* GetIdentifierInfo(unsigned PersistentID) {
@@ -96,7 +95,7 @@
   
 public:
   // The current PTH version.
-  enum { Version = 3 };
+  enum { Version = 5 };
 
   ~PTHManager();
   

Modified: cfe/trunk/lib/Lex/PTHLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/PTHLexer.cpp?rev=64326&r1=64325&r2=64326&view=diff

==============================================================================
--- cfe/trunk/lib/Lex/PTHLexer.cpp (original)
+++ cfe/trunk/lib/Lex/PTHLexer.cpp Wed Feb 11 15:29:16 2009
@@ -59,6 +59,21 @@
   return V;
 }
 
+// Bernstein hash function:
+// This is basically copy-and-paste from StringMap.  This likely won't
+// stay here, which is why I didn't both to expose this function from
+// String Map.
+static unsigned BernsteinHash(const char* x) {
+  unsigned int R = 0;
+  for ( ; *x != '\0' ; ++x) R = R * 33 + *x;
+  return R + (R >> 5);
+}
+
+static unsigned BernsteinHash(const char* x, unsigned n) {
+  unsigned int R = 0;
+  for (unsigned i = 0 ; i < n ; ++i, ++x) R = R * 33 + *x;
+  return R + (R >> 5);
+}
 
 //===----------------------------------------------------------------------===//
 // PTHLexer methods.
@@ -447,10 +462,7 @@
   }
 
   static unsigned ComputeHash(const char* x) {
-    // More copy-paste nonsense.  Will refactor.
-    unsigned int R = 0;
-    for (; *x != '\0' ; ++x) R = R * 33 + *x;
-    return R + (R >> 5);
+    return BernsteinHash(x);
   }
 
   static const char* GetInternalKey(const FileEntry* FE) {
@@ -472,9 +484,51 @@
     return PTHFileData(x, y); 
   }
 };
+
+class VISIBILITY_HIDDEN PTHStringLookupTrait {
+public:
+  typedef uint32_t 
+          data_type;
+
+  typedef const std::pair<const char*, unsigned>
+          external_key_type;
+
+  typedef external_key_type internal_key_type;
+  
+  static bool EqualKey(const internal_key_type& a,
+                       const internal_key_type& b) {
+    return (a.second == b.second) ? memcmp(a.first, b.first, a.second) == 0
+                                  : false;
+  }
+  
+  static unsigned ComputeHash(const internal_key_type& a) {
+    return BernsteinHash(a.first, a.second);
+  }
+  
+  // This hopefully will just get inlined and removed by the optimizer.
+  static const internal_key_type&
+  GetInternalKey(const external_key_type& x) { return x; }
+  
+  static std::pair<unsigned, unsigned>
+  ReadKeyDataLength(const unsigned char*& d) {
+    return std::make_pair((unsigned) ReadUnalignedLE16(d), sizeof(uint32_t));
+  }
+    
+  static std::pair<const char*, unsigned>
+  ReadKey(const unsigned char* d, unsigned n) {
+      assert(n >= 2 && d[n-1] == '\0');
+      return std::make_pair((const char*) d, n-1);
+    }
+    
+  static uint32_t ReadData(const unsigned char* d, unsigned) {
+    return ::ReadUnalignedLE32(d);
+  }
+};
+  
 } // end anonymous namespace  
 
-typedef OnDiskChainedHashTable<PTHFileLookupTrait> PTHFileLookup;
+typedef OnDiskChainedHashTable<PTHFileLookupTrait>   PTHFileLookup;
+typedef OnDiskChainedHashTable<PTHStringLookupTrait> PTHStringIdLookup;
 
 //===----------------------------------------------------------------------===//
 // PTHManager methods.
@@ -483,15 +537,16 @@
 PTHManager::PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup,
                        const unsigned char* idDataTable,
                        IdentifierInfo** perIDCache, 
-                       const unsigned char* sortedIdTable, unsigned numIds,
+                       void* stringIdLookup, unsigned numIds,
                        const unsigned char* spellingBase)
 : Buf(buf), PerIDCache(perIDCache), FileLookup(fileLookup),
-  IdDataTable(idDataTable), SortedIdTable(sortedIdTable),
+  IdDataTable(idDataTable), StringIdLookup(stringIdLookup),
   NumIds(numIds), PP(0), SpellingBase(spellingBase) {}
 
 PTHManager::~PTHManager() {
   delete Buf;
   delete (PTHFileLookup*) FileLookup;
+  delete (PTHStringIdLookup*) StringIdLookup;
   free(PerIDCache);
 }
 
@@ -513,7 +568,7 @@
                                                "PTH file %0 could not be read");
       Diags->Report(FullSourceLoc(), DiagID) << file;
     }
-    
+
     return 0;
   }
   
@@ -575,13 +630,21 @@
     return 0;
   }
   
-  // Get the location of the lexigraphically-sorted table of persistent IDs.
-  const unsigned char* SortedIdTableOffset = EndTable + sizeof(uint32_t)*1;
-  const unsigned char* SortedIdTable = BufBeg + ReadLE32(SortedIdTableOffset);
-  if (!(SortedIdTable >= BufBeg && SortedIdTable < BufEnd)) {
+  // Get the location of the hashtable mapping between strings and
+  // persistent IDs.
+  const unsigned char* StringIdTableOffset = EndTable + sizeof(uint32_t)*1;
+  const unsigned char* StringIdTable = BufBeg + ReadLE32(StringIdTableOffset);
+  if (!(StringIdTable >= BufBeg && StringIdTable < BufEnd)) {
     InvalidPTH(Diags);
     return 0;
   }
+
+  llvm::OwningPtr<PTHStringIdLookup> SL(PTHStringIdLookup::Create(StringIdTable,
+                                                                  BufBeg));
+  if (SL->isEmpty()) {
+    InvalidPTH(Diags, "PTH file contains no identifiers.");
+    return 0;
+  }
   
   // Get the location of the spelling cache.
   const unsigned char* spellingBaseOffset = EndTable + sizeof(uint32_t)*3;
@@ -609,7 +672,7 @@
 
   // Create the new PTHManager.
   return new PTHManager(File.take(), FL.take(), IData, PerIDCache,
-                        SortedIdTable, NumIds, spellingBase);
+                        SL.take(), NumIds, spellingBase);
 }
 IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) {
   // Look in the PTH file for the string data for the IdentifierInfo object.
@@ -623,56 +686,28 @@
     Alloc.Allocate<std::pair<IdentifierInfo,const unsigned char*> >();
 
   Mem->second = IDData;
+  assert(IDData[0] != '\0');
   IdentifierInfo *II = new ((void*) Mem) IdentifierInfo();
   
   // Store the new IdentifierInfo in the cache.
   PerIDCache[PersistentID] = II;
+  assert(II->getName() && II->getName()[0] != '\0');
   return II;
 }
 
 IdentifierInfo* PTHManager::get(const char *NameStart, const char *NameEnd) {
-  unsigned min = 0;
-  unsigned max = NumIds;
-  unsigned Len = NameEnd - NameStart;
-  
-  do {
-    unsigned i = (max - min) / 2 + min;
-    const unsigned char *Ptr = SortedIdTable + (i * 4);
-    
-    // Read the persistentID.
-    unsigned perID = ReadLE32(Ptr);
-    
-    // Get the IdentifierInfo.
-    IdentifierInfo* II = GetIdentifierInfo(perID);
-    
-    // First compare the lengths.
-    unsigned IILen = II->getLength();
-    if (Len < IILen) goto IsLess;
-    if (Len > IILen) goto IsGreater;
-    
-    // Now compare the strings!
-    {
-      signed comp = strncmp(NameStart, II->getName(), Len);
-      if (comp < 0) goto IsLess;
-      if (comp > 0) goto IsGreater;
-    }    
-    // We found a match!
-    return II;
-    
-  IsGreater:
-    if (i == min) break;
-    min = i;
-    continue;
-    
-  IsLess:
-    max = i;
-    assert(!(max == min) || (min == i));
-  }
-  while (min != max);
-  
-  return 0;
-}
+  PTHStringIdLookup& SL = *((PTHStringIdLookup*)StringIdLookup);
+  // Double check our assumption that the last character isn't '\0'.
+  assert(NameStart[NameEnd-NameStart-1] != '\0');
+  PTHStringIdLookup::iterator I = SL.find(std::make_pair(NameStart,
+                                                         NameEnd - NameStart));
+  if (I == SL.end()) // No identifier found?
+    return 0;
 
+  // Match found.  Return the identifier!
+  assert(*I > 0);
+  return GetIdentifierInfo(*I-1);
+}
 
 PTHLexer *PTHManager::CreateLexer(FileID FID) {
   const FileEntry *FE = PP->getSourceManager().getFileEntryForID(FID);





More information about the cfe-commits mailing list