[cfe-commits] r39058 - in /cfe/cfe/trunk: Lex/IdentifierTable.cpp include/clang/Lex/IdentifierTable.h
sabre at cs.uiuc.edu
sabre at cs.uiuc.edu
Wed Jul 11 09:27:12 PDT 2007
Author: sabre
Date: Wed Jul 11 11:27:12 2007
New Revision: 39058
URL: http://llvm.org/viewvc/llvm-project?rev=39058&view=rev
Log:
Reimplement the identifier hash table as a probed table instead of a chained
table. This is about 25% faster for identifier lookup. This also implements
resizing of the hash table.
Modified:
cfe/cfe/trunk/Lex/IdentifierTable.cpp
cfe/cfe/trunk/include/clang/Lex/IdentifierTable.h
Modified: cfe/cfe/trunk/Lex/IdentifierTable.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/Lex/IdentifierTable.cpp?rev=39058&r1=39057&r2=39058&view=diff
==============================================================================
--- cfe/cfe/trunk/Lex/IdentifierTable.cpp (original)
+++ cfe/cfe/trunk/Lex/IdentifierTable.cpp Wed Jul 11 11:27:12 2007
@@ -113,24 +113,22 @@
//===----------------------------------------------------------------------===//
-/// IdentifierLink - There is one of these allocated by IdentifierInfo.
-/// These form the linked list of buckets for the hash table.
+/// IdentifierBucket - The hash table consists of an array of these. If Info is
+/// non-null, this is an extant entry, otherwise, it is a hole.
struct IdentifierBucket {
- /// Next - This is the next bucket in the linked list.
- IdentifierBucket *Next;
+ /// FullHashValue - This remembers the full hash value of the identifier for
+ /// easy scanning.
+ unsigned FullHashValue;
- IdentifierInfo TokInfo;
- // NOTE: TokInfo must be the last element in this structure, as the string
- // information for the identifier is allocated right after it.
+ /// Info - This is a pointer to the actual identifier info object.
+ IdentifierInfo *Info;
};
-// FIXME: start hashtablesize off at 8K entries, GROW when density gets to 3.
-/// HASH_TABLE_SIZE - The current size of the hash table. Note that this must
-/// always be a power of two!
-static unsigned HASH_TABLE_SIZE = 8096*4;
-
IdentifierTable::IdentifierTable(const LangOptions &LangOpts) {
- IdentifierBucket **TableArray = new IdentifierBucket*[HASH_TABLE_SIZE]();
+ HashTableSize = 8192; // Start with space for 8K identifiers.
+ IdentifierBucket *TableArray = new IdentifierBucket[HashTableSize]();
+ memset(TableArray, 0, HashTableSize*sizeof(IdentifierBucket));
+
TheTable = TableArray;
NumIdentifiers = 0;
#if USE_ALLOCATOR
@@ -138,24 +136,22 @@
((MemRegion*)TheMemory)->Init(8*4096, 0);
#endif
- memset(TheTable, 0, HASH_TABLE_SIZE*sizeof(IdentifierBucket*));
-
+ // Populate the identifier table with info about keywords for the current
+ // language.
AddKeywords(LangOpts);
}
IdentifierTable::~IdentifierTable() {
- IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
- for (unsigned i = 0, e = HASH_TABLE_SIZE; i != e; ++i) {
- IdentifierBucket *Id = TableArray[i];
- while (Id) {
+ IdentifierBucket *TableArray = (IdentifierBucket*)TheTable;
+ for (unsigned i = 0, e = HashTableSize; i != e; ++i) {
+ if (IdentifierInfo *Id = TableArray[i].Info) {
// Free memory referenced by the identifier (e.g. macro info).
- Id->TokInfo.Destroy();
+ Id->Destroy();
- IdentifierBucket *Next = Id->Next;
#if !USE_ALLOCATOR
+ // Free the memory for the identifier itself.
free(Id);
#endif
- Id = Next;
}
}
#if USE_ALLOCATOR
@@ -177,62 +173,73 @@
IdentifierInfo &IdentifierTable::get(const char *NameStart,
const char *NameEnd) {
- IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
+ IdentifierBucket *TableArray = (IdentifierBucket*)TheTable;
- unsigned FullHash = HashString(NameStart, NameEnd);
- unsigned Hash = FullHash & (HASH_TABLE_SIZE-1);
+ unsigned HTSize = HashTableSize;
+ unsigned FullHashValue = HashString(NameStart, NameEnd);
+ unsigned BucketNo = FullHashValue & (HTSize-1);
unsigned Length = NameEnd-NameStart;
- IdentifierBucket *IdentHead = TableArray[Hash];
- for (IdentifierBucket *Identifier = IdentHead, *LastID = 0; Identifier;
- LastID = Identifier, Identifier = Identifier->Next) {
- if (Identifier->TokInfo.getNameLength() == Length &&
- Identifier->TokInfo.HashValue == FullHash &&
- memcmp(Identifier->TokInfo.getName(), NameStart, Length) == 0) {
- // If found identifier wasn't at start of bucket, move it there so
- // that frequently searched for identifiers are found earlier, even if
- // they first occur late in the source file.
- if (LastID) {
- LastID->Next = Identifier->Next;
- Identifier->Next = IdentHead;
- TableArray[Hash] = Identifier;
- }
-
- return Identifier->TokInfo;
- }
+ unsigned ProbeAmt = 1;
+ while (1) {
+ IdentifierBucket &Bucket = TableArray[BucketNo];
+ IdentifierInfo *BucketII = Bucket.Info;
+ // If we found an empty bucket, this identifier isn't in the table yet.
+ if (BucketII == 0) break;
+
+ // If the full hash value matches, check deeply for a match. The common
+ // case here is that we are only looking at the buckets (for identifier info
+ // being non-null and for the full hash value) not at the identifiers. This
+ // is important for cache locality.
+ if (Bucket.FullHashValue == FullHashValue &&
+ BucketII->getNameLength() == Length &&
+ memcmp(BucketII->getName(), NameStart, Length) == 0)
+ // We found a match!
+ return *BucketII;
+
+ // Okay, we didn't find the identifier. Probe to the next bucket.
+ BucketNo = (BucketNo+ProbeAmt) & (HashTableSize-1);
+
+ // Use quadratic probing, it has fewer clumping artifacts than linear
+ // probing and has good cache behavior in the common case.
+ ++ProbeAmt;
}
-
- // Allocate a new identifier, with space for the null-terminated string at the
- // end.
- unsigned AllocSize = sizeof(IdentifierBucket)+Length+1;
+
+ // Okay, the identifier doesn't already exist, and BucketNo is the bucket to
+ // fill in. Allocate a new identifier with space for the null-terminated
+ // string at the end.
+ unsigned AllocSize = sizeof(IdentifierInfo)+Length+1;
#if USE_ALLOCATOR
- IdentifierBucket *Identifier = (IdentifierBucket*)
+ IdentifierInfo *Identifier = (IdentifierInfo*)
((MemRegion*)TheMemory)->Allocate(AllocSize, (MemRegion**)&TheMemory);
#else
- IdentifierBucket *Identifier = (IdentifierBucket*)malloc(AllocSize);
+ IdentifierInfo *Identifier = (IdentifierInfo*)malloc(AllocSize);
#endif
- Identifier->TokInfo.NameLen = Length;
- Identifier->TokInfo.Macro = 0;
- Identifier->TokInfo.TokenID = tok::identifier;
- Identifier->TokInfo.PPID = tok::pp_not_keyword;
- Identifier->TokInfo.ObjCID = tok::objc_not_keyword;
- Identifier->TokInfo.IsExtension = false;
- Identifier->TokInfo.IsPoisoned = false;
- Identifier->TokInfo.IsOtherTargetMacro = false;
- Identifier->TokInfo.FETokenInfo = 0;
- Identifier->TokInfo.HashValue = FullHash;
+ Identifier->NameLen = Length;
+ Identifier->Macro = 0;
+ Identifier->TokenID = tok::identifier;
+ Identifier->PPID = tok::pp_not_keyword;
+ Identifier->ObjCID = tok::objc_not_keyword;
+ Identifier->IsExtension = false;
+ Identifier->IsPoisoned = false;
+ Identifier->IsOtherTargetMacro = false;
+ Identifier->FETokenInfo = 0;
+ ++NumIdentifiers;
// Copy the string information.
char *StrBuffer = (char*)(Identifier+1);
memcpy(StrBuffer, NameStart, Length);
StrBuffer[Length] = 0; // Null terminate string.
- // Link it into the hash table. Adding it to the start of the hash table is
- // useful for buckets with lots of entries. This means that more recently
- // referenced identifiers will be near the head of the bucket.
- Identifier->Next = IdentHead;
- TableArray[Hash] = Identifier;
- return Identifier->TokInfo;
+ // Fill in the bucket for the hash table.
+ TableArray[BucketNo].Info = Identifier;
+ TableArray[BucketNo].FullHashValue = FullHashValue;
+
+ // If the hash table is now more than 3/4 full, rehash into a larger table.
+ if (NumIdentifiers > HashTableSize*3/4)
+ RehashTable();
+
+ return *Identifier;
}
IdentifierInfo &IdentifierTable::get(const std::string &Name) {
@@ -242,13 +249,51 @@
return get(NameBytes, NameBytes+Size);
}
+/// RehashTable - Double the size of the hash table and move every existing
+/// identifier into the new bucket array.  No identifiers are allocated or
+/// freed here; only the bucket array itself is replaced.
+void IdentifierTable::RehashTable() {
+  unsigned NewSize = HashTableSize*2;
+  IdentifierBucket *NewTableArray = new IdentifierBucket[NewSize]();
+  memset(NewTableArray, 0, NewSize*sizeof(IdentifierBucket));
+
+  // Rehash all the identifiers into their new buckets.  Luckily we already have
+  // the hash values available :).
+  IdentifierBucket *CurTable = (IdentifierBucket *)TheTable;
+  for (IdentifierBucket *IB = CurTable, *E = CurTable+HashTableSize;
+       IB != E; ++IB) {
+    // Holes in the old table have nothing to move.
+    if (IB->Info == 0) continue;
+
+    unsigned FullHash = IB->FullHashValue;
+    unsigned NewBucket = FullHash & (NewSize-1);
+
+    // If the home bucket is already taken, probe with the same quadratic
+    // sequence that get() uses so that later lookups find the identifier.
+    // In the fast case (home bucket free) this loop does not execute.
+    unsigned ProbeSize = 1;
+    while (NewTableArray[NewBucket].Info)
+      NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
+
+    // Fill in the free slot we found.  This must be NewBucket, not the home
+    // bucket: storing into the home bucket after probing would overwrite the
+    // identifier that already lives there and drop it from the table.
+    NewTableArray[NewBucket].Info = IB->Info;
+    NewTableArray[NewBucket].FullHashValue = FullHash;
+  }
+
+  delete[] CurTable;
+
+  TheTable = NewTableArray;
+  HashTableSize = NewSize;
+}
+
+
/// VisitIdentifiers - This method walks through all of the identifiers,
/// invoking IV->VisitIdentifier for each of them.
void IdentifierTable::VisitIdentifiers(const IdentifierVisitor &IV) {
- IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
- for (unsigned i = 0, e = HASH_TABLE_SIZE; i != e; ++i) {
- for (IdentifierBucket *Id = TableArray[i]; Id; Id = Id->Next)
- IV.VisitIdentifier(Id->TokInfo);
+ IdentifierBucket *TableArray = (IdentifierBucket*)TheTable;
+ for (unsigned i = 0, e = HashTableSize; i != e; ++i) {
+ if (IdentifierInfo *Id = TableArray[i].Info)
+ IV.VisitIdentifier(*Id);
}
}
@@ -337,49 +382,37 @@
/// PrintStats - Print statistics about how well the identifier table is doing
/// at hashing identifiers.
void IdentifierTable::PrintStats() const {
- unsigned NumIdentifiers = 0;
unsigned NumEmptyBuckets = 0;
- unsigned MaxBucketLength = 0;
unsigned AverageIdentifierSize = 0;
unsigned MaxIdentifierLength = 0;
+ unsigned NumProbed = 0;
- IdentifierBucket **TableArray = (IdentifierBucket**)TheTable;
- for (unsigned i = 0, e = HASH_TABLE_SIZE; i != e; ++i) {
-
- unsigned NumIdentifiersInBucket = 0;
- for (IdentifierBucket *Id = TableArray[i]; Id; Id = Id->Next) {
- AverageIdentifierSize += Id->TokInfo.getNameLength();
- if (MaxIdentifierLength < Id->TokInfo.getNameLength())
- MaxIdentifierLength = Id->TokInfo.getNameLength();
- ++NumIdentifiersInBucket;
- }
- if (NumIdentifiersInBucket > MaxBucketLength) {
- MaxBucketLength = NumIdentifiersInBucket;
-
-#if 0 // This code can be enabled to see (with -stats) a sample of some of the
- // longest buckets in the hash table. Useful for inspecting density of
- // buckets etc.
- std::cerr << "Bucket length " << MaxBucketLength << ":\n";
- for (IdentifierBucket *Id = TableArray[i]; Id; Id = Id->Next) {
- std::cerr << " " << Id->TokInfo.getName() << " hash = "
- << Id->TokInfo.HashValue << "\n";
- }
-#endif
- }
- if (NumIdentifiersInBucket == 0)
+ IdentifierBucket *TableArray = (IdentifierBucket*)TheTable;
+ for (unsigned i = 0, e = HashTableSize; i != e; ++i) {
+ if (TableArray[i].Info == 0) {
++NumEmptyBuckets;
+ continue;
+ }
+ IdentifierInfo *Id = TableArray[i].Info;
+
+ AverageIdentifierSize += Id->getNameLength();
+ if (MaxIdentifierLength < Id->getNameLength())
+ MaxIdentifierLength = Id->getNameLength();
+
+ // Count the number of times something was probed.
+ if ((TableArray[i].FullHashValue & (e-1)) != i)
+ ++NumProbed;
- NumIdentifiers += NumIdentifiersInBucket;
+ // TODO: Figure out maximum times an identifier had to probe for -stats.
}
std::cerr << "\n*** Identifier Table Stats:\n";
std::cerr << "# Identifiers: " << NumIdentifiers << "\n";
std::cerr << "# Empty Buckets: " << NumEmptyBuckets << "\n";
- std::cerr << "Max identifiers in one bucket: " << MaxBucketLength << "\n";
std::cerr << "Hash density (#identifiers per bucket): "
- << NumIdentifiers/(double)HASH_TABLE_SIZE << "\n";
- std::cerr << "Nonempty hash density (average chain length): "
- << NumIdentifiers/(double)(HASH_TABLE_SIZE-NumEmptyBuckets) << "\n";
+ << NumIdentifiers/(double)HashTableSize << "\n";
+ std::cerr << "Num probed identifiers: " << NumProbed << " ("
+ << NumProbed*100.0/NumIdentifiers << "%)\n";
std::cerr << "Ave identifier length: "
<< (AverageIdentifierSize/(double)NumIdentifiers) << "\n";
std::cerr << "Max identifier length: " << MaxIdentifierLength << "\n";
Modified: cfe/cfe/trunk/include/clang/Lex/IdentifierTable.h
URL: http://llvm.org/viewvc/llvm-project/cfe/cfe/trunk/include/clang/Lex/IdentifierTable.h?rev=39058&r1=39057&r2=39058&view=diff
==============================================================================
--- cfe/cfe/trunk/include/clang/Lex/IdentifierTable.h (original)
+++ cfe/cfe/trunk/include/clang/Lex/IdentifierTable.h Wed Jul 11 11:27:12 2007
@@ -39,7 +39,6 @@
bool IsPoisoned : 1; // True if identifier is poisoned.
bool IsOtherTargetMacro : 1; // True if ident is macro on another target.
void *FETokenInfo; // Managed by the language front-end.
- unsigned HashValue; // The full (non-truncated) hash value.
friend class IdentifierTable;
public:
/// getName - Return the actual string for this identifier. The length of
@@ -121,6 +120,7 @@
class IdentifierTable {
void *TheTable;
void *TheMemory;
+ unsigned HashTableSize;
unsigned NumIdentifiers;
public:
/// IdentifierTable ctor - Create the identifier table, populating it with
@@ -142,6 +142,7 @@
void PrintStats() const;
private:
void AddKeywords(const LangOptions &LangOpts);
+ void RehashTable();
};
} // end namespace llvm
More information about the cfe-commits
mailing list