[llvm] r303665 - [PDB] Hash types up front when merging types instead of using StringMap
Reid Kleckner via llvm-commits
llvm-commits at lists.llvm.org
Tue May 23 11:23:59 PDT 2017
Author: rnk
Date: Tue May 23 13:23:59 2017
New Revision: 303665
URL: http://llvm.org/viewvc/llvm-project?rev=303665&view=rev
Log:
[PDB] Hash types up front when merging types instead of using StringMap
Summary:
First, StringMap uses llvm::HashString, which is only good for short
identifiers and really bad for large blobs of binary data like type
records. Moving to `DenseMap<StringRef, TypeIndex>` with some tricks for
memory allocation fixes that.
Unfortunately, that didn't buy very much performance. Profiling showed
that we spend a long time during DenseMap growth rehashing existing
entries. Also, in general, DenseMap is faster when the keys are small.
This change takes that to the logical conclusion by introducing a small
wrapper value type around a pointer to key data. The key data contains a
precomputed hash, the original record data (pointer and size), and the
type index, which is the "value" of our original map.
This reduces the time to produce llvm-as.exe and llvm-as.pdb from ~15s
on my machine to 3.5s, which is about a 4x improvement.
Reviewers: zturner, inglorion, ruiu
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D33428
Modified:
llvm/trunk/include/llvm/DebugInfo/CodeView/TypeSerializer.h
llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
llvm/trunk/lib/DebugInfo/CodeView/TypeSerializer.cpp
llvm/trunk/lib/DebugInfo/CodeView/TypeTableCollection.cpp
llvm/trunk/tools/llvm-pdbdump/llvm-pdbdump.cpp
Modified: llvm/trunk/include/llvm/DebugInfo/CodeView/TypeSerializer.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/DebugInfo/CodeView/TypeSerializer.h?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/include/llvm/DebugInfo/CodeView/TypeSerializer.h (original)
+++ llvm/trunk/include/llvm/DebugInfo/CodeView/TypeSerializer.h Tue May 23 13:23:59 2017
@@ -17,7 +17,6 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Error.h"
@@ -26,6 +25,8 @@ namespace llvm {
namespace codeview {
+class TypeHasher;
+
class TypeSerializer : public TypeVisitorCallbacks {
struct SubRecord {
SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {}
@@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisito
}
};
- typedef SmallVector<MutableArrayRef<uint8_t>, 2> RecordList;
+ typedef SmallVector<MutableArrayRef<uint8_t>, 2> MutableRecordList;
static constexpr uint8_t ContinuationLength = 8;
BumpPtrAllocator &RecordStorage;
RecordSegment CurrentSegment;
- RecordList FieldListSegments;
+ MutableRecordList FieldListSegments;
- TypeIndex LastTypeIndex;
Optional<TypeLeafKind> TypeKind;
Optional<TypeLeafKind> MemberKind;
std::vector<uint8_t> RecordBuffer;
@@ -60,28 +60,23 @@ class TypeSerializer : public TypeVisito
BinaryStreamWriter Writer;
TypeRecordMapping Mapping;
- RecordList SeenRecords;
- StringMap<TypeIndex> HashedRecords;
+ /// Private type record hashing implementation details are handled here.
+ std::unique_ptr<TypeHasher> Hasher;
bool isInFieldList() const;
- TypeIndex calcNextTypeIndex() const;
- TypeIndex incrementTypeIndex();
MutableArrayRef<uint8_t> getCurrentSubRecordData();
MutableArrayRef<uint8_t> getCurrentRecordData();
Error writeRecordPrefix(TypeLeafKind Kind);
- TypeIndex insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record);
- TypeIndex insertRecordBytesWithCopy(CVType &Record,
- MutableArrayRef<uint8_t> Data);
Expected<MutableArrayRef<uint8_t>>
addPadding(MutableArrayRef<uint8_t> Record);
public:
explicit TypeSerializer(BumpPtrAllocator &Storage);
+ ~TypeSerializer();
- ArrayRef<MutableArrayRef<uint8_t>> records() const;
- TypeIndex getLastTypeIndex() const;
- TypeIndex insertRecordBytes(MutableArrayRef<uint8_t> Record);
+ ArrayRef<ArrayRef<uint8_t>> records() const;
+ TypeIndex insertRecordBytes(ArrayRef<uint8_t> Record);
Expected<TypeIndex> visitTypeEndGetIndex(CVType &Record);
Error visitTypeBegin(CVType &Record) override;
Modified: llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h (original)
+++ llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h Tue May 23 13:23:59 2017
@@ -64,7 +64,7 @@ public:
return *ExpectedIndex;
}
- TypeIndex writeSerializedRecord(MutableArrayRef<uint8_t> Record) {
+ TypeIndex writeSerializedRecord(ArrayRef<uint8_t> Record) {
return Serializer.insertRecordBytes(Record);
}
@@ -77,9 +77,7 @@ public:
}
}
- ArrayRef<MutableArrayRef<uint8_t>> records() const {
- return Serializer.records();
- }
+ ArrayRef<ArrayRef<uint8_t>> records() const { return Serializer.records(); }
};
class FieldListRecordBuilder {
Modified: llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableCollection.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableCollection.h?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableCollection.h (original)
+++ llvm/trunk/include/llvm/DebugInfo/CodeView/TypeTableCollection.h Tue May 23 13:23:59 2017
@@ -18,7 +18,7 @@ namespace codeview {
class TypeTableCollection : public TypeCollection {
public:
- explicit TypeTableCollection(ArrayRef<MutableArrayRef<uint8_t>> Records);
+ explicit TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records);
Optional<TypeIndex> getFirst() override;
Optional<TypeIndex> getNext(TypeIndex Prev) override;
@@ -33,7 +33,7 @@ private:
bool hasCapacityFor(TypeIndex Index) const;
void ensureTypeExists(TypeIndex Index);
- ArrayRef<MutableArrayRef<uint8_t>> Records;
+ ArrayRef<ArrayRef<uint8_t>> Records;
TypeDatabase Database;
};
}
Modified: llvm/trunk/lib/DebugInfo/CodeView/TypeSerializer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/DebugInfo/CodeView/TypeSerializer.cpp?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/lib/DebugInfo/CodeView/TypeSerializer.cpp (original)
+++ llvm/trunk/lib/DebugInfo/CodeView/TypeSerializer.cpp Tue May 23 13:23:59 2017
@@ -9,6 +9,7 @@
#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/BinaryStreamWriter.h"
#include <string.h>
@@ -16,21 +17,116 @@
using namespace llvm;
using namespace llvm::codeview;
-bool TypeSerializer::isInFieldList() const {
- return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
-}
+namespace {
+struct HashedType {
+ uint64_t Hash;
+ const uint8_t *Data;
+ unsigned Size; // FIXME: Go to uint16_t?
+ TypeIndex Index;
+};
+
+/// Wrapper around a poitner to a HashedType. Hash and equality operations are
+/// based on data in the pointee.
+struct HashedTypePtr {
+ HashedTypePtr() = default;
+ HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {}
+ HashedType *Ptr = nullptr;
+};
+} // namespace
+
+template <> struct DenseMapInfo<HashedTypePtr> {
+ static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); }
+ static inline HashedTypePtr getTombstoneKey() {
+ return HashedTypePtr(reinterpret_cast<HashedType *>(1));
+ }
+ static unsigned getHashValue(HashedTypePtr Val) {
+ assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr);
+ return Val.Ptr->Hash;
+ }
+ static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) {
+ HashedType *LHS = LHSP.Ptr;
+ HashedType *RHS = RHSP.Ptr;
+ if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr)
+ return LHS == RHS;
+ if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size)
+ return false;
+ return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0;
+ }
+};
+
+/// Private implementation so that we don't leak our DenseMap instantiations to
+/// users.
+class llvm::codeview::TypeHasher {
+private:
+ /// Storage for type record provided by the caller. Records will outlive the
+ /// hasher object, so they should be allocated here.
+ BumpPtrAllocator &RecordStorage;
+
+ /// Storage for hash keys. These only need to live as long as the hashing
+ /// operation.
+ BumpPtrAllocator KeyStorage;
+
+ /// Hash table. We really want a DenseMap<ArrayRef<uint8_t>, TypeIndex> here,
+ /// but DenseMap is inefficient when the keys are long (like type records)
+ /// because it recomputes the hash value of every key when it grows. This
+ /// value type stores the hash out of line in KeyStorage, so that table
+ /// entries are small and easy to rehash.
+ DenseSet<HashedTypePtr> HashedRecords;
+
+ SmallVector<ArrayRef<uint8_t>, 2> SeenRecords;
+
+ TypeIndex NextTypeIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
+
+public:
+ TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {}
+
+ ArrayRef<ArrayRef<uint8_t>> records() const { return SeenRecords; }
+
+ /// Takes the bytes of type record, inserts them into the hash table, saves
+ /// them, and returns a pointer to an identical stable type record along with
+ /// its type index in the destination stream.
+ TypeIndex getOrCreateRecord(ArrayRef<uint8_t> &Record);
+};
+
+TypeIndex TypeHasher::getOrCreateRecord(ArrayRef<uint8_t> &Record) {
+ assert(Record.size() < UINT32_MAX && "Record too big");
+ assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
+
+ // Compute the hash up front so we can store it in the key.
+ HashedType TempHashedType = {hash_value(Record), Record.data(),
+ unsigned(Record.size()), NextTypeIndex};
+
+ auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType));
+ HashedType *&Hashed = Result.first->Ptr;
+
+ if (Result.second) {
+ // This was a new type record. We need stable storage for both the key and
+ // the record. The record should outlive the hashing operation.
+ Hashed = KeyStorage.Allocate<HashedType>();
+ *Hashed = TempHashedType;
+
+ uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
+ memcpy(Stable, Record.data(), Record.size());
+ Hashed->Data = Stable;
+ assert(Hashed->Size == Record.size());
+
+ // This was a new record, so increment our next type index.
+ ++NextTypeIndex;
+ }
+
+ // Update the caller's copy of Record to point a stable copy.
+ Record = ArrayRef<uint8_t>(Hashed->Data, Hashed->Size);
+
+ if (Result.second) {
+ // FIXME: Can we record these in a more efficient way?
+ SeenRecords.push_back(Record);
+ }
-TypeIndex TypeSerializer::calcNextTypeIndex() const {
- if (LastTypeIndex.isNoneType())
- return TypeIndex(TypeIndex::FirstNonSimpleIndex);
- else
- return TypeIndex(LastTypeIndex.getIndex() + 1);
+ return TypeIndex(Hashed->Index);
}
-TypeIndex TypeSerializer::incrementTypeIndex() {
- TypeIndex Previous = LastTypeIndex;
- LastTypeIndex = calcNextTypeIndex();
- return Previous;
+bool TypeSerializer::isInFieldList() const {
+ return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST;
}
MutableArrayRef<uint8_t> TypeSerializer::getCurrentSubRecordData() {
@@ -51,46 +147,6 @@ Error TypeSerializer::writeRecordPrefix(
return Error::success();
}
-TypeIndex
-TypeSerializer::insertRecordBytesPrivate(MutableArrayRef<uint8_t> Record) {
- assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
-
- StringRef S(reinterpret_cast<const char *>(Record.data()), Record.size());
-
- TypeIndex NextTypeIndex = calcNextTypeIndex();
- auto Result = HashedRecords.try_emplace(S, NextTypeIndex);
- if (Result.second) {
- LastTypeIndex = NextTypeIndex;
- SeenRecords.push_back(Record);
- }
- return Result.first->getValue();
-}
-
-TypeIndex
-TypeSerializer::insertRecordBytesWithCopy(CVType &Record,
- MutableArrayRef<uint8_t> Data) {
- assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
-
- StringRef S(reinterpret_cast<const char *>(Data.data()), Data.size());
-
- // Do a two state lookup / insert so that we don't have to allocate unless
- // we're going
- // to do an insert. This is a big memory savings.
- auto Iter = HashedRecords.find(S);
- if (Iter != HashedRecords.end())
- return Iter->second;
-
- LastTypeIndex = calcNextTypeIndex();
- uint8_t *Copy = RecordStorage.Allocate<uint8_t>(Data.size());
- ::memcpy(Copy, Data.data(), Data.size());
- Data = MutableArrayRef<uint8_t>(Copy, Data.size());
- S = StringRef(reinterpret_cast<const char *>(Data.data()), Data.size());
- HashedRecords.insert(std::make_pair(S, LastTypeIndex));
- SeenRecords.push_back(Data);
- Record.RecordData = Data;
- return LastTypeIndex;
-}
-
Expected<MutableArrayRef<uint8_t>>
TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
uint32_t Align = Record.size() % 4;
@@ -109,26 +165,25 @@ TypeSerializer::addPadding(MutableArrayR
}
TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
- : RecordStorage(Storage), LastTypeIndex(),
- RecordBuffer(MaxRecordLength * 2),
+ : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2),
Stream(RecordBuffer, llvm::support::little), Writer(Stream),
- Mapping(Writer) {
+ Mapping(Writer), Hasher(make_unique<TypeHasher>(Storage)) {
// RecordBuffer needs to be able to hold enough data so that if we are 1
// byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
// we won't overflow.
}
-ArrayRef<MutableArrayRef<uint8_t>> TypeSerializer::records() const {
- return SeenRecords;
-}
+TypeSerializer::~TypeSerializer() = default;
-TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; }
+ArrayRef<ArrayRef<uint8_t>> TypeSerializer::records() const {
+ return Hasher->records();
+}
-TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef<uint8_t> Record) {
+TypeIndex TypeSerializer::insertRecordBytes(ArrayRef<uint8_t> Record) {
assert(!TypeKind.hasValue() && "Already in a type mapping!");
assert(Writer.getOffset() == 0 && "Stream has data already!");
- return insertRecordBytesPrivate(Record);
+ return Hasher->getOrCreateRecord(Record);
}
Error TypeSerializer::visitTypeBegin(CVType &Record) {
@@ -163,8 +218,8 @@ Expected<TypeIndex> TypeSerializer::visi
Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t);
Record.Type = *TypeKind;
- TypeIndex InsertedTypeIndex =
- insertRecordBytesWithCopy(Record, ThisRecordData);
+ Record.RecordData = ThisRecordData;
+ TypeIndex InsertedTypeIndex = Hasher->getOrCreateRecord(Record.RecordData);
// Write out each additional segment in reverse order, and update each
// record's continuation index to point to the previous one.
@@ -174,7 +229,7 @@ Expected<TypeIndex> TypeSerializer::visi
reinterpret_cast<support::ulittle32_t *>(CIBytes.data());
assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder");
*CI = InsertedTypeIndex.getIndex();
- InsertedTypeIndex = insertRecordBytesPrivate(X);
+ InsertedTypeIndex = Hasher->getOrCreateRecord(X);
}
TypeKind.reset();
Modified: llvm/trunk/lib/DebugInfo/CodeView/TypeTableCollection.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/DebugInfo/CodeView/TypeTableCollection.cpp?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/lib/DebugInfo/CodeView/TypeTableCollection.cpp (original)
+++ llvm/trunk/lib/DebugInfo/CodeView/TypeTableCollection.cpp Tue May 23 13:23:59 2017
@@ -24,8 +24,7 @@ static void error(Error &&EC) {
consumeError(std::move(EC));
}
-TypeTableCollection::TypeTableCollection(
- ArrayRef<MutableArrayRef<uint8_t>> Records)
+TypeTableCollection::TypeTableCollection(ArrayRef<ArrayRef<uint8_t>> Records)
: Records(Records), Database(Records.size()) {}
Optional<TypeIndex> TypeTableCollection::getFirst() {
Modified: llvm/trunk/tools/llvm-pdbdump/llvm-pdbdump.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/tools/llvm-pdbdump/llvm-pdbdump.cpp?rev=303665&r1=303664&r2=303665&view=diff
==============================================================================
--- llvm/trunk/tools/llvm-pdbdump/llvm-pdbdump.cpp (original)
+++ llvm/trunk/tools/llvm-pdbdump/llvm-pdbdump.cpp Tue May 23 13:23:59 2017
@@ -877,14 +877,12 @@ static void mergePdbs() {
auto &DestTpi = Builder.getTpiBuilder();
auto &DestIpi = Builder.getIpiBuilder();
- MergedTpi.ForEachRecord(
- [&DestTpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
- DestTpi.addTypeRecord(Data, None);
- });
- MergedIpi.ForEachRecord(
- [&DestIpi](TypeIndex TI, MutableArrayRef<uint8_t> Data) {
- DestIpi.addTypeRecord(Data, None);
- });
+ MergedTpi.ForEachRecord([&DestTpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
+ DestTpi.addTypeRecord(Data, None);
+ });
+ MergedIpi.ForEachRecord([&DestIpi](TypeIndex TI, ArrayRef<uint8_t> Data) {
+ DestIpi.addTypeRecord(Data, None);
+ });
SmallString<64> OutFile(opts::merge::PdbOutputFile);
if (OutFile.empty()) {
More information about the llvm-commits
mailing list