[cfe-commits] r64245 - in /cfe/trunk: Driver/CacheTokens.cpp lib/Lex/PTHLexer.cpp

Tue Feb 10 14:16:22 PST 2009

Author: kremenek
Date: Tue Feb 10 16:16:22 2009
New Revision: 64245

URL: http://llvm.org/viewvc/llvm-project?rev=64245&view=rev
Log:
PTH: Replace the ad hoc 'file name' -> 'PTH data' lookup table in the PTH file with an on-disk chained hash table.  This data structure is implemented using templates, and will be used to replace similar data structures.  This change leads to no visible performance impact on Cocoa.h, but now we only pay a price for the table on the order of the number of files accessed, not the number of files in the PTH file.

Modified:
    cfe/trunk/Driver/CacheTokens.cpp
    cfe/trunk/lib/Lex/PTHLexer.cpp

Modified: cfe/trunk/Driver/CacheTokens.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/Driver/CacheTokens.cpp?rev=64245&r1=64244&r2=64245&view=diff

==============================================================================

--- cfe/trunk/Driver/CacheTokens.cpp (original)
+++ cfe/trunk/Driver/CacheTokens.cpp Tue Feb 10 16:16:22 2009
@@ -47,9 +47,10 @@
   Out << (unsigned char)(V >> 24);
 }
 
-static void Pad(llvm::raw_fd_ostream& Out, unsigned Alignment) {
-  Offset off = (Offset) Out.tell();  
-  for (unsigned Pad = off % Alignment ; Pad != 0 ; --Pad, ++off) Emit8(Out, 0);
+static void Pad(llvm::raw_fd_ostream& Out, unsigned A) {
+  Offset off = (Offset) Out.tell();
+  uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off;
+  for ( ; n ; --n ) Emit8(Out, 0);
 }
 
 //===----------------------------------------------------------------------===//
@@ -65,13 +66,13 @@
   
   class Item {
   public:
-    typename Info::KeyT key;
-    typename Info::DataT data;
+    typename Info::key_type key;
+    typename Info::data_type data;
     Item *next;
     const uint32_t hash;
     
-    Item(typename Info::KeyT_ref k, typename Info::DataT_ref d)
-    : key(k), data(d), next(0), hash(Info::getHash(k)) {}
+    Item(typename Info::key_type_ref k, typename Info::data_type_ref d)
+    : key(k), data(d), next(0), hash(Info::ComputeHash(k)) {}
   };
   
   class Bucket { 
@@ -86,7 +87,7 @@
   Bucket* Buckets;
   
 private:
-  void insert(Item** b, size_t size, Item* E) {
+  void insert(Bucket* b, size_t size, Item* E) {
     unsigned idx = E->hash & (size - 1);
     Bucket& B = b[idx];
     E->next = B.head;
@@ -95,12 +96,12 @@
   }
   
   void resize(size_t newsize) {
-    Bucket* newBuckets = calloc(newsize, sizeof(Bucket));
-    
+    Bucket* newBuckets = (Bucket*) calloc(newsize, sizeof(Bucket));
+    // Populate newBuckets with the old entries.
     for (unsigned i = 0; i < NumBuckets; ++i)
-      for (Item* E = Buckets[i]; E ; ) {
+      for (Item* E = Buckets[i].head; E ; ) {
         Item* N = E->next;
-        E->Next = 0;
+        E->next = 0;
         insert(newBuckets, newsize, E);
         E = N;
       }
@@ -112,7 +113,9 @@
   
 public:
   
-  void insert(typename Info::Key_ref key, typename Info::DataT_ref data) {
+  void insert(typename Info::key_type_ref key,
+              typename Info::data_type_ref data) {
+
     ++NumEntries;
     if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
     insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data));
@@ -125,25 +128,26 @@
       if (!B.head) continue;
       
       // Store the offset for the data of this bucket.
-      Pad(out, 4); // 4-byte alignment.
       B.off = out.tell();
       
-      // Write out the number of items in the bucket.  We just write out
-      // 4 bytes to keep things 4-byte aligned.
-      Emit32(out, B.length);
+      // Write out the number of items in the bucket.
+      Emit16(out, B.length);
       
       // Write out the entries in the bucket.
       for (Item *I = B.head; I ; I = I->next) {
         Emit32(out, I->hash);
-        Info::EmitKey(out, I->key);
-        Info::EmitData(out, I->data);
+        const std::pair<unsigned, unsigned>& Len = 
+          Info::EmitKeyDataLength(out, I->key, I->data);
+        Info::EmitKey(out, I->key, Len.first);
+        Info::EmitData(out, I->data, Len.second);
       }
     }
     
     // Emit the hashtable itself.
     Pad(out, 4);
     Offset TableOff = out.tell();
-    Emit32(out, NumBuckets);    
+    Emit32(out, NumBuckets);
+    Emit32(out, NumEntries);
     for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
     
     return TableOff;
@@ -151,8 +155,10 @@
   
   OnDiskChainedHashTableGenerator() {
     NumEntries = 0;
-    NumBuckets = 64;
-    Buckets = calloc(NumBuckets, sizeof(Bucket));
+    NumBuckets = 64;    
+    // Note that we do not need to run the constructors of the individual
+    // Bucket objects since 'calloc' returns bytes that are all 0.
+    Buckets = (Bucket*) calloc(NumBuckets, sizeof(Bucket));
   }
   
   ~OnDiskChainedHashTableGenerator() {
@@ -178,6 +184,44 @@
   Offset getPPCondTableOffset() const { return PPCondData; }
 };
   
+class VISIBILITY_HIDDEN FileEntryPCHEntryInfo {
+public:
+  typedef const FileEntry* key_type;
+  typedef key_type key_type_ref;
+  
+  typedef PCHEntry data_type;
+  typedef const PCHEntry& data_type_ref;
+  
+  static unsigned ComputeHash(const FileEntry* FE) {
+    // Bernstein hash function:
+    // This is basically copy-and-paste from StringMap.  This likely won't
+    // stay here, which is why I didn't bother to expose this function from
+    // StringMap.  There are plenty of other hash functions which are likely
+    // to perform better and be faster.
+    unsigned int R = 0;
+    for (const char* x = FE->getName(); *x != '\0' ; ++x) R = R * 33 + *x;
+    return R + (R >> 5);
+  }
+  
+  static std::pair<unsigned,unsigned> 
+  EmitKeyDataLength(llvm::raw_ostream& Out, const FileEntry* FE,
+                    const PCHEntry& E) {
+
+    unsigned n = strlen(FE->getName()) + 1;
+    ::Emit16(Out, n);
+    return std::make_pair(n, 8);
+  }
+  
+  static void EmitKey(llvm::raw_ostream& Out, const FileEntry* FE, unsigned n) {
+    Out.write(FE->getName(), n);
+  }
+  
+  static void EmitData(llvm::raw_ostream& Out, const PCHEntry& E, unsigned) {
+    ::Emit32(Out, E.getTokenOffset());
+    ::Emit32(Out, E.getPPCondTableOffset());
+  }        
+};
+  
 class OffsetOpt {
   bool valid;
   Offset off;
@@ -189,7 +233,7 @@
 };
 } // end anonymous namespace
 
-typedef llvm::DenseMap<const FileEntry*, PCHEntry> PCHMap;
+typedef OnDiskChainedHashTableGenerator<FileEntryPCHEntryInfo> PCHMap;
 typedef llvm::DenseMap<const IdentifierInfo*,uint32_t> IDMap;
 typedef llvm::StringMap<OffsetOpt, llvm::BumpPtrAllocator> CachedStrsTy;
 
@@ -547,14 +591,14 @@
     if (!P.isAbsolute())
       continue;
 
-    assert(!PM.count(FE) && "fileinfo's are not uniqued on FileEntry?");
+    // assert(!PM.count(FE) && "fileinfo's are not uniqued on FileEntry?");
     
     const llvm::MemoryBuffer *B = C.getBuffer();
     if (!B) continue;
 
     FileID FID = SM.createFileID(FE, SourceLocation(), SrcMgr::C_User);
     Lexer L(FID, SM, LOpts);
-    PM[FE] = LexTokens(L);
+    PM.insert(FE, LexTokens(L));
   }
 
   // Write out the identifier table.
@@ -607,22 +651,5 @@
 //===----------------------------------------------------------------------===//
 
 Offset PTHWriter::EmitFileTable() {
-  // Determine the offset where this table appears in the PTH file.
-  Offset off = (Offset) Out.tell();
-  
-  // Output the size of the table.
-  Emit32(PM.size());
-  
-  for (PCHMap::iterator I=PM.begin(), E=PM.end(); I!=E; ++I) {
-    const FileEntry* FE = I->first;
-    const char* Name = FE->getName();
-    unsigned size = strlen(Name);
-    Emit32(size);
-    EmitBuf(Name, Name+size);
-    Emit32(I->second.getTokenOffset());
-    Emit32(I->second.getPPCondTableOffset());
-  }
-  
-  return off;
+  return PM.Emit(Out);
 }
-

Modified: cfe/trunk/lib/Lex/PTHLexer.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Lex/PTHLexer.cpp?rev=64245&r1=64244&r2=64245&view=diff

==============================================================================
--- cfe/trunk/lib/Lex/PTHLexer.cpp (original)
+++ cfe/trunk/lib/Lex/PTHLexer.cpp Tue Feb 10 16:16:22 2009
@@ -34,12 +34,21 @@
 //===----------------------------------------------------------------------===//
 
 static inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
-  uint16_t V = ((uint16_t)Data[0] <<  0) |
+  uint16_t V = ((uint16_t)Data[0]) |
                ((uint16_t)Data[1] <<  8);
   Data += 2;
   return V;
 }
 
+static inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
+  uint32_t V = ((uint32_t)Data[0])  |
+               ((uint32_t)Data[1] << 8)  |
+               ((uint32_t)Data[2] << 16) |
+               ((uint32_t)Data[3] << 24);
+  Data += 4;
+  return V;
+}
+
 static inline uint32_t ReadLE32(const unsigned char *&Data) {
   // Hosts that directly support little-endian 32-bit loads can just
   // use them.  Big-endian hosts need a bswap.
@@ -306,70 +315,166 @@
 }
 
 //===----------------------------------------------------------------------===//
-// Internal Data Structures for PTH file lookup and resolving identifiers.
+// OnDiskChainedHashTable
 //===----------------------------------------------------------------------===//
 
+template<typename Info>
+class OnDiskChainedHashTable {
+  const unsigned NumBuckets;
+  const unsigned NumEntries;
+  const unsigned char* const Buckets;
+  const unsigned char* const Base;
+public:
+  typedef typename Info::internal_key_type internal_key_type;
+  typedef typename Info::external_key_type external_key_type;
+  typedef typename Info::data_type         data_type;
+  
+  OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
+                         const unsigned char* buckets,
+                         const unsigned char* base)
+    : NumBuckets(numBuckets), NumEntries(numEntries),
+      Buckets(buckets), Base(base) {        
+        assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
+               "'buckets' must have a 4-byte alignment");
+      }
+
+  
+  bool isEmpty() const { return NumEntries == 0; }
+  
+  class iterator {
+    const unsigned char* const data;
+    const unsigned len;
+  public:
+    iterator() : data(0), len(0) {}
+    iterator(const unsigned char* d, unsigned l) : data(d), len(l) {}
+    
+    data_type operator*() const { return Info::ReadData(data, len); }    
+    bool operator==(const iterator& X) const { return X.data == data; }    
+    bool operator!=(const iterator& X) const { return X.data != data; }
+  };    
+  
+  iterator find(const external_key_type& eKey) {
+    const internal_key_type& iKey = Info::GetInternalKey(eKey);
+    unsigned key_hash = Info::ComputeHash(iKey);
+    
+    // Each bucket is just a 32-bit offset into the PTH file.
+    unsigned idx = key_hash & (NumBuckets - 1);
+    const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
+    
+    unsigned offset = ReadLE32(Bucket);
+    if (offset == 0) return iterator(); // Empty bucket.
+    const unsigned char* Items = Base + offset;
+    
+    // 'Items' starts with a 16-bit unsigned integer representing the
+    // number of items in this bucket.
+    unsigned len = ReadUnalignedLE16(Items);
+    
+    for (unsigned i = 0; i < len; ++i) {
+      // Read the hash.
+      uint32_t item_hash = ReadUnalignedLE32(Items);
+      
+      // Determine the length of the key and the data.
+      const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);      
+      unsigned item_len = L.first + L.second;
+
+      // Compare the hashes.  If they are not the same, skip the entry entirely.
+      if (item_hash != key_hash) {
+        Items += item_len;
+        continue;
+      }
+      
+      // Read the key.
+      const internal_key_type& X =
+        Info::ReadKey((const unsigned char* const) Items, L.first);
+
+      // If the key doesn't match just skip reading the value.
+      if (!Info::EqualKey(X, iKey)) {
+        Items += item_len;
+        continue;
+      }
+      
+      // The key matches!
+      return iterator(Items + L.first, L.second);
+    }
+    
+    return iterator();
+  }
+  
+  iterator end() const { return iterator(); }
+  
+  
+  static OnDiskChainedHashTable* Create(const unsigned char* buckets,
+                                        const unsigned char* const base) {
+
+    assert(buckets > base);
+    assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
+           "buckets should be 4-byte aligned.");
+    
+    unsigned numBuckets = ReadLE32(buckets);
+    unsigned numEntries = ReadLE32(buckets);
+    return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
+                                            base);
+  }  
+};
+
+//===----------------------------------------------------------------------===//
+// PTH file lookup: map from strings to file data.
+//===----------------------------------------------------------------------===//
 
 /// PTHFileLookup - This internal data structure is used by the PTHManager
 ///  to map from FileEntry objects managed by FileManager to offsets within
 ///  the PTH file.
 namespace {
-class VISIBILITY_HIDDEN PTHFileLookup {
+class VISIBILITY_HIDDEN PTHFileData {
+  const uint32_t TokenOff;
+  const uint32_t PPCondOff;
 public:
-  class Val {
-    uint32_t TokenOff;
-    uint32_t PPCondOff;
-  public:
-    Val() : TokenOff(~0) {}
-    Val(uint32_t toff, uint32_t poff)
-      : TokenOff(toff), PPCondOff(poff) {}
-    
-    bool isValid() const { return TokenOff != ~((uint32_t)0); }
-
-    uint32_t getTokenOffset() const {
-      assert(isValid() && "PTHFileLookup entry initialized.");
-      return TokenOff;
-    }
+  PTHFileData(uint32_t tokenOff, uint32_t ppCondOff)
+    : TokenOff(tokenOff), PPCondOff(ppCondOff) {}
     
-    uint32_t getPPCondOffset() const {
-      assert(isValid() && "PTHFileLookup entry initialized.");
-      return PPCondOff;
-    }    
-  };
-  
-private:
-  llvm::StringMap<Val> FileMap;
+  uint32_t getTokenOffset() const { return TokenOff; }  
+  uint32_t getPPCondOffset() const { return PPCondOff; }  
+};
   
+class VISIBILITY_HIDDEN PTHFileLookupTrait {
 public:
-  PTHFileLookup() {};
+  typedef PTHFileData      data_type;
+  typedef const FileEntry* external_key_type;
+  typedef const char*      internal_key_type;
   
-  bool isEmpty() const {
-    return FileMap.empty();
+  static bool EqualKey(const char* a, const char* b) {
+    return strcmp(a, b) == 0;
+  }
+
+  static unsigned ComputeHash(const char* x) {
+    // More copy-paste nonsense.  Will refactor.
+    unsigned int R = 0;
+    for (; *x != '\0' ; ++x) R = R * 33 + *x;
+    return R + (R >> 5);
+  }
+
+  static const char* GetInternalKey(const FileEntry* FE) {
+    return FE->getName();
   }
   
-  Val Lookup(const FileEntry* FE) {
-    const char* s = FE->getName();
-    unsigned size = strlen(s);
-    return FileMap.GetOrCreateValue(s, s+size).getValue();
+  static std::pair<unsigned, unsigned>
+  ReadKeyDataLength(const unsigned char*& d) {
+    return std::make_pair((unsigned) ReadUnalignedLE16(d), 8U);
   }
   
-  void ReadTable(const unsigned char* D) {    
-    uint32_t N = ReadLE32(D);     // Read the length of the table.
-    
-    for ( ; N > 0; --N) {       // The rest of the data is the table itself.
-      uint32_t Len = ReadLE32(D);
-      const char* s = (const char *)D;
-      D += Len;
-
-      uint32_t TokenOff = ReadLE32(D);
-      uint32_t PPCondOff = ReadLE32(D);
-
-      FileMap.GetOrCreateValue(s, s+Len).getValue() =
-        Val(TokenOff, PPCondOff);
-    }
+  static const char* ReadKey(const unsigned char* d, unsigned) {
+    return (const char*) d;
+  }
+  
+  static PTHFileData ReadData(const unsigned char* d, unsigned) {
+    uint32_t x = ::ReadUnalignedLE32(d);
+    uint32_t y = ::ReadUnalignedLE32(d);
+    return PTHFileData(x, y); 
   }
 };
-} // end anonymous namespace
+} // end anonymous namespace  
+
+typedef OnDiskChainedHashTable<PTHFileLookupTrait> PTHFileLookup;
 
 //===----------------------------------------------------------------------===//
 // PTHManager methods.
@@ -454,9 +559,7 @@
     return 0; // FIXME: Proper error diagnostic?
   }
   
-  llvm::OwningPtr<PTHFileLookup> FL(new PTHFileLookup());
-  FL->ReadTable(FileTable);
-
+  llvm::OwningPtr<PTHFileLookup> FL(PTHFileLookup::Create(FileTable, BufBeg));
   if (FL->isEmpty()) {
     InvalidPTH(Diags, "PTH file contains no cached source data");
     return 0;
@@ -579,11 +682,14 @@
   // Lookup the FileEntry object in our file lookup data structure.  It will
   // return a variant that indicates whether or not there is an offset within
   // the PTH file that contains cached tokens.
-  PTHFileLookup::Val FileData = ((PTHFileLookup*)FileLookup)->Lookup(FE);
+  PTHFileLookup& PFL = *((PTHFileLookup*)FileLookup);
+  PTHFileLookup::iterator I = PFL.find(FE);
   
-  if (!FileData.isValid()) // No tokens available.
+  if (I == PFL.end()) // No tokens available?
     return 0;
   
+  const PTHFileData& FileData = *I;  
+  
   const unsigned char *BufStart = (const unsigned char *)Buf->getBufferStart();
   // Compute the offset of the token data within the buffer.
   const unsigned char* data = BufStart + FileData.getTokenOffset();