[llvm-branch-commits] [llvm] [IR2Vec] Refactor vocabulary to use section-based storage (PR #158376)

Mircea Trofin via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Sep 17 07:23:24 PDT 2025


================
@@ -261,55 +262,106 @@ void FlowAwareEmbedder::computeEmbeddings(const BasicBlock &BB) const {
   BBVecMap[&BB] = BBVector;
 }
 
+// ==----------------------------------------------------------------------===//
+// VocabStorage
+//===----------------------------------------------------------------------===//
+
+// Builds the storage from per-section embedding vectors. SectionData is moved
+// into Sections; TotalSize and Dimension are then derived from Sections.
+// The vocabulary must contain at least one section and its first section must
+// be non-empty (both conditions are asserted).
+// NOTE(review): both lambdas read the Sections member, so this relies on
+// Sections being declared before TotalSize and Dimension in the class -
+// confirm against the header.
+VocabStorage::VocabStorage(std::vector<std::vector<Embedding>> &&SectionData)
+    : Sections(std::move(SectionData)), TotalSize([&] {
+        assert(!Sections.empty() && "Vocabulary has no sections");
+        assert(!Sections[0].empty() && "First section of vocabulary is empty");
+        // Total size is the sum of the entry counts of all sections
+        size_t Size = 0;
+        for (const auto &Section : Sections)
+          Size += Section.size();
+        return Size;
+      }()),
+      Dimension([&] {
+        // Get dimension from the first embedding in the first section - all
+        // embeddings must have the same dimension (not verified here)
+        assert(!Sections.empty() && "Vocabulary has no sections");
+        assert(!Sections[0].empty() && "First section of vocabulary is empty");
+        return static_cast<unsigned>(Sections[0][0].size());
+      }()) {}
+
+// Returns a reference to the embedding at (SectionId, LocalIndex) in the
+// underlying storage. Both coordinates are asserted to be in range, so
+// dereferencing an iterator positioned past the last section trips the first
+// assert in builds with assertions enabled.
+const Embedding &VocabStorage::const_iterator::operator*() const {
+  assert(SectionId < Storage->Sections.size() && "Invalid section ID");
+  assert(LocalIndex < Storage->Sections[SectionId].size() &&
+         "Local index out of range");
+  return Storage->Sections[SectionId][LocalIndex];
+}
+
+// Pre-increment: advances to the next embedding, crossing into following
+// sections as needed, and returns *this.
+VocabStorage::const_iterator &VocabStorage::const_iterator::operator++() {
+  ++LocalIndex;
+  // Check if we need to move to the next section. This is a loop rather than
+  // a single check so that empty sections are skipped. After the last element
+  // it stops with SectionId == getNumSections() and LocalIndex == 0.
+  // NOTE(review): presumably that state is what end() returns - confirm
+  // against the header.
+  while (SectionId < Storage->getNumSections() &&
+         LocalIndex >= Storage->Sections[SectionId].size()) {
+    LocalIndex = 0;
+    ++SectionId;
+  }
+  return *this;
+}
+
+// Two iterators are equal when they point into the same storage object
+// (compared by Storage pointer) and have the same section and local index.
+bool VocabStorage::const_iterator::operator==(
+    const const_iterator &Other) const {
+  return Storage == Other.Storage && SectionId == Other.SectionId &&
+         LocalIndex == Other.LocalIndex;
+}
+
+// Inequality, defined as the negation of operator== so the two stay in sync.
+bool VocabStorage::const_iterator::operator!=(
+    const const_iterator &Other) const {
+  return !(*this == Other);
+}
+
 // ==----------------------------------------------------------------------===//
 // Vocabulary
 //===----------------------------------------------------------------------===//
 
 unsigned Vocabulary::getDimension() const {
----------------
mtrofin wrote:

You can move trivial accessors to the header. That gives better performance in non-[Thin]LTO builds, since the compiler can then inline them at call sites.

https://github.com/llvm/llvm-project/pull/158376


More information about the llvm-branch-commits mailing list