[llvm] [NFC][IR2Vec][MIR2Vec] llvm-ir2vec refactor - move Tool class implementations into a separate utils file (PR #174133)

Nishant Sachdeva via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 11 03:18:29 PST 2026


https://github.com/nishant-sachdeva updated https://github.com/llvm/llvm-project/pull/174133

>From 70a61d64925fe550488857ba1f017e67c535417e Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Sun, 21 Dec 2025 01:25:33 +0530
Subject: [PATCH 01/13] Work Commit - Separating all tool implementation from
 cli file

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt         |   1 +
 llvm/tools/llvm-ir2vec/emb-tool.cpp           | 421 ++++++++++++++++++
 .../llvm-ir2vec/{llvm-ir2vec.h => emb-tool.h} |   8 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp        | 372 +---------------
 4 files changed, 427 insertions(+), 375 deletions(-)
 create mode 100644 llvm/tools/llvm-ir2vec/emb-tool.cpp
 rename llvm/tools/llvm-ir2vec/{llvm-ir2vec.h => emb-tool.h} (98%)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 2bb6686392907..9d5db8663fb38 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -19,6 +19,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_tool(llvm-ir2vec
   llvm-ir2vec.cpp
+  emb-tool.cpp
   
   DEPENDS
   intrinsics_gen
diff --git a/llvm/tools/llvm-ir2vec/emb-tool.cpp b/llvm/tools/llvm-ir2vec/emb-tool.cpp
new file mode 100644
index 0000000000000..891b26f8ef763
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/emb-tool.cpp
@@ -0,0 +1,421 @@
+//===- emb-tool.cpp - IR2Vec/MIR2Vec Embedding Generation Tool -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the IR2VecTool and MIR2VecTool classes for
+/// IR2Vec/MIR2Vec embedding generation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "emb-tool.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassInstrumentation.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/CodeGen/MIR2Vec.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "ir2vec"
+
+namespace llvm {
+
+namespace ir2vec {
+
+bool IR2VecTool::initializeVocabulary() {
+  // Register and run the IR2Vec vocabulary analysis
+  // The vocabulary file path is specified via --ir2vec-vocab-path global
+  // option
+  MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+  MAM.registerPass([&] { return IR2VecVocabAnalysis(); });
+  // This will throw an error if vocab is not found or invalid
+  Vocab = &MAM.getResult<IR2VecVocabAnalysis>(M);
+  return Vocab->isValid();
+}
+
+TripletResult IR2VecTool::generateTriplets(const Function &F) const {
+  if (F.isDeclaration())
+    return {};
+
+  TripletResult Result;
+  Result.MaxRelation = 0;
+
+  unsigned MaxRelation = NextRelation;
+  unsigned PrevOpcode = 0;
+  bool HasPrevOpcode = false;
+
+  for (const BasicBlock &BB : F) {
+    for (const auto &I : BB.instructionsWithoutDebug()) {
+      unsigned Opcode = Vocabulary::getIndex(I.getOpcode());
+      unsigned TypeID = Vocabulary::getIndex(I.getType()->getTypeID());
+
+      // Add "Next" relationship with previous instruction
+      if (HasPrevOpcode) {
+        Result.Triplets.push_back({PrevOpcode, Opcode, NextRelation});
+        LLVM_DEBUG(dbgs() << Vocabulary::getVocabKeyForOpcode(PrevOpcode + 1)
+                          << '\t'
+                          << Vocabulary::getVocabKeyForOpcode(Opcode + 1)
+                          << '\t' << "Next\n");
+      }
+
+      // Add "Type" relationship
+      Result.Triplets.push_back({Opcode, TypeID, TypeRelation});
+      LLVM_DEBUG(
+          dbgs() << Vocabulary::getVocabKeyForOpcode(Opcode + 1) << '\t'
+                 << Vocabulary::getVocabKeyForTypeID(I.getType()->getTypeID())
+                 << '\t' << "Type\n");
+
+      // Add "Arg" relationships
+      unsigned ArgIndex = 0;
+      for (const Use &U : I.operands()) {
+        unsigned OperandID = Vocabulary::getIndex(*U.get());
+        unsigned RelationID = ArgRelation + ArgIndex;
+        Result.Triplets.push_back({Opcode, OperandID, RelationID});
+
+        LLVM_DEBUG({
+          StringRef OperandStr = Vocabulary::getVocabKeyForOperandKind(
+              Vocabulary::getOperandKind(U.get()));
+          dbgs() << Vocabulary::getVocabKeyForOpcode(Opcode + 1) << '\t'
+                 << OperandStr << '\t' << "Arg" << ArgIndex << '\n';
+        });
+
+        ++ArgIndex;
+      }
+      // Only update MaxRelation if there were operands
+      if (ArgIndex > 0)
+        MaxRelation = std::max(MaxRelation, ArgRelation + ArgIndex - 1);
+      PrevOpcode = Opcode;
+      HasPrevOpcode = true;
+    }
+  }
+
+  Result.MaxRelation = MaxRelation;
+  return Result;
+}
+
+TripletResult IR2VecTool::generateTriplets() const {
+  TripletResult Result;
+  Result.MaxRelation = NextRelation;
+
+  for (const Function &F : M.getFunctionDefs()) {
+    TripletResult FuncResult = generateTriplets(F);
+    Result.MaxRelation = std::max(Result.MaxRelation, FuncResult.MaxRelation);
+    Result.Triplets.insert(Result.Triplets.end(), FuncResult.Triplets.begin(),
+                           FuncResult.Triplets.end());
+  }
+
+  return Result;
+}
+
+void IR2VecTool::writeTripletsToStream(raw_ostream &OS) const {
+  auto Result = generateTriplets();
+  OS << "MAX_RELATION=" << Result.MaxRelation << '\n';
+  for (const auto &T : Result.Triplets)
+    OS << T.Head << '\t' << T.Tail << '\t' << T.Relation << '\n';
+}
+
+EntityList IR2VecTool::collectEntityMappings() {
+  auto EntityLen = Vocabulary::getCanonicalSize();
+  EntityList Result;
+  for (unsigned EntityID = 0; EntityID < EntityLen; ++EntityID)
+    Result.push_back(Vocabulary::getStringKey(EntityID).str());
+  return Result;
+}
+
+void IR2VecTool::writeEntitiesToStream(raw_ostream &OS) {
+  auto Entities = collectEntityMappings();
+  OS << Entities.size() << "\n";
+  for (unsigned EntityID = 0; EntityID < Entities.size(); ++EntityID)
+    OS << Entities[EntityID] << '\t' << EntityID << '\n';
+}
+
+void IR2VecTool::writeEmbeddingsToStream(raw_ostream &OS,
+                                         EmbeddingLevel Level) const {
+  if (!Vocab->isValid()) {
+    WithColor::error(errs(), ToolName)
+        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
+    return;
+  }
+
+  for (const Function &F : M.getFunctionDefs())
+    writeEmbeddingsToStream(F, OS, Level);
+}
+
+void IR2VecTool::writeEmbeddingsToStream(const Function &F, raw_ostream &OS,
+                                         EmbeddingLevel Level) const {
+  if (!Vocab || !Vocab->isValid()) {
+    WithColor::error(errs(), ToolName)
+        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
+    return;
+  }
+  if (F.isDeclaration()) {
+    OS << "Function " << F.getName() << " is a declaration, skipping.\n";
+    return;
+  }
+
+  // Create embedder for this function
+  auto Emb = Embedder::create(IR2VecEmbeddingKind, F, *Vocab);
+  if (!Emb) {
+    WithColor::error(errs(), ToolName)
+        << "Failed to create embedder for function " << F.getName() << "\n";
+    return;
+  }
+
+  OS << "Function: " << F.getName() << "\n";
+
+  // Generate embeddings based on the specified level
+  switch (Level) {
+  case FunctionLevel:
+    Emb->getFunctionVector().print(OS);
+    break;
+  case BasicBlockLevel:
+    for (const BasicBlock &BB : F) {
+      OS << BB.getName() << ":";
+      Emb->getBBVector(BB).print(OS);
+    }
+    break;
+  case InstructionLevel:
+    for (const Instruction &I : instructions(F)) {
+      OS << I;
+      Emb->getInstVector(I).print(OS);
+    }
+    break;
+  }
+}
+
+} // namespace ir2vec
+
+namespace mir2vec {
+
+bool MIR2VecTool::initializeVocabulary(const Module &M) {
+  MIR2VecVocabProvider Provider(MMI);
+  auto VocabOrErr = Provider.getVocabulary(M);
+  if (!VocabOrErr) {
+    WithColor::error(errs(), ToolName)
+        << "Failed to load MIR2Vec vocabulary - "
+        << toString(VocabOrErr.takeError()) << "\n";
+    return false;
+  }
+  Vocab = std::make_unique<MIRVocabulary>(std::move(*VocabOrErr));
+  return true;
+}
+
+bool MIR2VecTool::initializeVocabularyForLayout(const Module &M) {
+  for (const Function &F : M.getFunctionDefs()) {
+    MachineFunction *MF = MMI.getMachineFunction(F);
+    if (!MF)
+      continue;
+
+    const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+    const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+
+    auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(TII, TRI, MRI, 1);
+    if (!VocabOrErr) {
+      WithColor::error(errs(), ToolName)
+          << "Failed to create dummy vocabulary - "
+          << toString(VocabOrErr.takeError()) << "\n";
+      return false;
+    }
+    Vocab = std::make_unique<MIRVocabulary>(std::move(*VocabOrErr));
+    return true;
+  }
+
+  WithColor::error(errs(), ToolName)
+      << "No machine functions found to initialize vocabulary\n";
+  return false;
+}
+
+TripletResult MIR2VecTool::generateTriplets(const MachineFunction &MF) const {
+  TripletResult Result;
+  Result.MaxRelation = MIRNextRelation;
+
+  if (!Vocab) {
+    WithColor::error(errs(), ToolName)
+        << "MIR Vocabulary must be initialized for triplet generation.\n";
+    return Result;
+  }
+
+  unsigned PrevOpcode = 0;
+  bool HasPrevOpcode = false;
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      // Skip debug instructions
+      if (MI.isDebugInstr())
+        continue;
+
+      // Get opcode entity ID
+      unsigned OpcodeID = Vocab->getEntityIDForOpcode(MI.getOpcode());
+
+      // Add "Next" relationship with previous instruction
+      if (HasPrevOpcode) {
+        Result.Triplets.push_back({PrevOpcode, OpcodeID, MIRNextRelation});
+        LLVM_DEBUG(dbgs() << Vocab->getStringKey(PrevOpcode) << '\t'
+                          << Vocab->getStringKey(OpcodeID) << '\t' << "Next\n");
+      }
+
+      // Add "Arg" relationships for operands
+      unsigned ArgIndex = 0;
+      for (const MachineOperand &MO : MI.operands()) {
+        auto OperandID = Vocab->getEntityIDForMachineOperand(MO);
+        unsigned RelationID = MIRArgRelation + ArgIndex;
+        Result.Triplets.push_back({OpcodeID, OperandID, RelationID});
+        LLVM_DEBUG({
+          std::string OperandStr = Vocab->getStringKey(OperandID);
+          dbgs() << Vocab->getStringKey(OpcodeID) << '\t' << OperandStr << '\t'
+                 << "Arg" << ArgIndex << '\n';
+        });
+
+        ++ArgIndex;
+      }
+
+      // Update MaxRelation if there were operands
+      if (ArgIndex > 0)
+        Result.MaxRelation =
+            std::max(Result.MaxRelation, MIRArgRelation + ArgIndex - 1);
+
+      PrevOpcode = OpcodeID;
+      HasPrevOpcode = true;
+    }
+  }
+
+  return Result;
+}
+
+TripletResult MIR2VecTool::generateTriplets(const Module &M) const {
+  TripletResult Result;
+  Result.MaxRelation = MIRNextRelation;
+
+  for (const Function &F : M.getFunctionDefs()) {
+    MachineFunction *MF = MMI.getMachineFunction(F);
+    if (!MF) {
+      WithColor::warning(errs(), ToolName)
+          << "No MachineFunction for " << F.getName() << "\n";
+      continue;
+    }
+
+    TripletResult FuncResult = generateTriplets(*MF);
+    Result.MaxRelation = std::max(Result.MaxRelation, FuncResult.MaxRelation);
+    Result.Triplets.insert(Result.Triplets.end(), FuncResult.Triplets.begin(),
+                           FuncResult.Triplets.end());
+  }
+
+  return Result;
+}
+
+void MIR2VecTool::writeTripletsToStream(const Module &M,
+                                        raw_ostream &OS) const {
+  auto Result = generateTriplets(M);
+  OS << "MAX_RELATION=" << Result.MaxRelation << '\n';
+  for (const auto &T : Result.Triplets)
+    OS << T.Head << '\t' << T.Tail << '\t' << T.Relation << '\n';
+}
+
+EntityList MIR2VecTool::collectEntityMappings() const {
+  if (!Vocab) {
+    WithColor::error(errs(), ToolName)
+        << "Vocabulary must be initialized for entity mappings.\n";
+    return {};
+  }
+
+  const unsigned EntityCount = Vocab->getCanonicalSize();
+  EntityList Result;
+  for (unsigned EntityID = 0; EntityID < EntityCount; ++EntityID)
+    Result.push_back(Vocab->getStringKey(EntityID));
+
+  return Result;
+}
+
+void MIR2VecTool::writeEntitiesToStream(raw_ostream &OS) const {
+  auto Entities = collectEntityMappings();
+  if (Entities.empty())
+    return;
+
+  OS << Entities.size() << "\n";
+  for (unsigned EntityID = 0; EntityID < Entities.size(); ++EntityID)
+    OS << Entities[EntityID] << '\t' << EntityID << '\n';
+}
+
+void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
+                                          EmbeddingLevel Level) const {
+  if (!Vocab) {
+    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
+    return;
+  }
+
+  for (const Function &F : M.getFunctionDefs()) {
+    MachineFunction *MF = MMI.getMachineFunction(F);
+    if (!MF) {
+      WithColor::warning(errs(), ToolName)
+          << "No MachineFunction for " << F.getName() << "\n";
+      continue;
+    }
+
+    writeEmbeddingsToStream(*MF, OS, Level);
+  }
+}
+
+void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
+                                          EmbeddingLevel Level) const {
+  if (!Vocab) {
+    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
+    return;
+  }
+
+  auto Emb = MIREmbedder::create(MIR2VecKind::Symbolic, MF, *Vocab);
+  if (!Emb) {
+    WithColor::error(errs(), ToolName)
+        << "Failed to create embedder for " << MF.getName() << "\n";
+    return;
+  }
+
+  OS << "MIR2Vec embeddings for machine function " << MF.getName() << ":\n";
+
+  // Generate embeddings based on the specified level
+  switch (Level) {
+  case FunctionLevel:
+    OS << "Function vector: ";
+    Emb->getMFunctionVector().print(OS);
+    break;
+  case BasicBlockLevel:
+    OS << "Basic block vectors:\n";
+    for (const MachineBasicBlock &MBB : MF) {
+      OS << "MBB " << MBB.getName() << ": ";
+      Emb->getMBBVector(MBB).print(OS);
+    }
+    break;
+  case InstructionLevel:
+    OS << "Instruction vectors:\n";
+    for (const MachineBasicBlock &MBB : MF) {
+      for (const MachineInstr &MI : MBB) {
+        OS << MI << " -> ";
+        Emb->getMInstVector(MI).print(OS);
+      }
+    }
+    break;
+  }
+}
+
+} // namespace mir2vec
+
+} // namespace llvm
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.h b/llvm/tools/llvm-ir2vec/emb-tool.h
similarity index 98%
rename from llvm/tools/llvm-ir2vec/llvm-ir2vec.h
rename to llvm/tools/llvm-ir2vec/emb-tool.h
index 566c362edbd22..009bcec60108b 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.h
+++ b/llvm/tools/llvm-ir2vec/emb-tool.h
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_IR2VEC_LLVM_IR2VEC_H
-#define LLVM_TOOLS_LLVM_IR2VEC_LLVM_IR2VEC_H
+#ifndef LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
+#define LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
@@ -44,7 +44,7 @@
 #define DEBUG_TYPE "ir2vec"
 
 namespace llvm {
-
+  
 /// Tool name for error reporting
 static const char *ToolName = "llvm-ir2vec";
 
@@ -198,4 +198,4 @@ struct MIRContext {
 
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_IR2VEC_LLVM_IR2VEC_H
\ No newline at end of file
+#endif // LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 6b70e09518fa7..a2b2f4e6a7aa8 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -54,7 +54,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "llvm-ir2vec.h"
+#include "emb-tool.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
 #include "llvm/IR/BasicBlock.h"
@@ -147,167 +147,6 @@ static cl::opt<EmbeddingLevel>
 
 namespace ir2vec {
 
-bool IR2VecTool::initializeVocabulary() {
-  // Register and run the IR2Vec vocabulary analysis
-  // The vocabulary file path is specified via --ir2vec-vocab-path global
-  // option
-  MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
-  MAM.registerPass([&] { return IR2VecVocabAnalysis(); });
-  // This will throw an error if vocab is not found or invalid
-  Vocab = &MAM.getResult<IR2VecVocabAnalysis>(M);
-  return Vocab->isValid();
-}
-
-TripletResult IR2VecTool::generateTriplets(const Function &F) const {
-  if (F.isDeclaration())
-    return {};
-
-  TripletResult Result;
-  Result.MaxRelation = 0;
-
-  unsigned MaxRelation = NextRelation;
-  unsigned PrevOpcode = 0;
-  bool HasPrevOpcode = false;
-
-  for (const BasicBlock &BB : F) {
-    for (const auto &I : BB.instructionsWithoutDebug()) {
-      unsigned Opcode = Vocabulary::getIndex(I.getOpcode());
-      unsigned TypeID = Vocabulary::getIndex(I.getType()->getTypeID());
-
-      // Add "Next" relationship with previous instruction
-      if (HasPrevOpcode) {
-        Result.Triplets.push_back({PrevOpcode, Opcode, NextRelation});
-        LLVM_DEBUG(dbgs() << Vocabulary::getVocabKeyForOpcode(PrevOpcode + 1)
-                          << '\t'
-                          << Vocabulary::getVocabKeyForOpcode(Opcode + 1)
-                          << '\t' << "Next\n");
-      }
-
-      // Add "Type" relationship
-      Result.Triplets.push_back({Opcode, TypeID, TypeRelation});
-      LLVM_DEBUG(
-          dbgs() << Vocabulary::getVocabKeyForOpcode(Opcode + 1) << '\t'
-                 << Vocabulary::getVocabKeyForTypeID(I.getType()->getTypeID())
-                 << '\t' << "Type\n");
-
-      // Add "Arg" relationships
-      unsigned ArgIndex = 0;
-      for (const Use &U : I.operands()) {
-        unsigned OperandID = Vocabulary::getIndex(*U.get());
-        unsigned RelationID = ArgRelation + ArgIndex;
-        Result.Triplets.push_back({Opcode, OperandID, RelationID});
-
-        LLVM_DEBUG({
-          StringRef OperandStr = Vocabulary::getVocabKeyForOperandKind(
-              Vocabulary::getOperandKind(U.get()));
-          dbgs() << Vocabulary::getVocabKeyForOpcode(Opcode + 1) << '\t'
-                 << OperandStr << '\t' << "Arg" << ArgIndex << '\n';
-        });
-
-        ++ArgIndex;
-      }
-      // Only update MaxRelation if there were operands
-      if (ArgIndex > 0)
-        MaxRelation = std::max(MaxRelation, ArgRelation + ArgIndex - 1);
-      PrevOpcode = Opcode;
-      HasPrevOpcode = true;
-    }
-  }
-
-  Result.MaxRelation = MaxRelation;
-  return Result;
-}
-
-TripletResult IR2VecTool::generateTriplets() const {
-  TripletResult Result;
-  Result.MaxRelation = NextRelation;
-
-  for (const Function &F : M.getFunctionDefs()) {
-    TripletResult FuncResult = generateTriplets(F);
-    Result.MaxRelation = std::max(Result.MaxRelation, FuncResult.MaxRelation);
-    Result.Triplets.insert(Result.Triplets.end(), FuncResult.Triplets.begin(),
-                           FuncResult.Triplets.end());
-  }
-
-  return Result;
-}
-
-void IR2VecTool::writeTripletsToStream(raw_ostream &OS) const {
-  auto Result = generateTriplets();
-  OS << "MAX_RELATION=" << Result.MaxRelation << '\n';
-  for (const auto &T : Result.Triplets)
-    OS << T.Head << '\t' << T.Tail << '\t' << T.Relation << '\n';
-}
-
-EntityList IR2VecTool::collectEntityMappings() {
-  auto EntityLen = Vocabulary::getCanonicalSize();
-  EntityList Result;
-  for (unsigned EntityID = 0; EntityID < EntityLen; ++EntityID)
-    Result.push_back(Vocabulary::getStringKey(EntityID).str());
-  return Result;
-}
-
-void IR2VecTool::writeEntitiesToStream(raw_ostream &OS) {
-  auto Entities = collectEntityMappings();
-  OS << Entities.size() << "\n";
-  for (unsigned EntityID = 0; EntityID < Entities.size(); ++EntityID)
-    OS << Entities[EntityID] << '\t' << EntityID << '\n';
-}
-
-void IR2VecTool::writeEmbeddingsToStream(raw_ostream &OS,
-                                         EmbeddingLevel Level) const {
-  if (!Vocab->isValid()) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
-    return;
-  }
-
-  for (const Function &F : M.getFunctionDefs())
-    writeEmbeddingsToStream(F, OS, Level);
-}
-
-void IR2VecTool::writeEmbeddingsToStream(const Function &F, raw_ostream &OS,
-                                         EmbeddingLevel Level) const {
-  if (!Vocab || !Vocab->isValid()) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
-    return;
-  }
-  if (F.isDeclaration()) {
-    OS << "Function " << F.getName() << " is a declaration, skipping.\n";
-    return;
-  }
-
-  // Create embedder for this function
-  auto Emb = Embedder::create(IR2VecEmbeddingKind, F, *Vocab);
-  if (!Emb) {
-    WithColor::error(errs(), ToolName)
-        << "Failed to create embedder for function " << F.getName() << "\n";
-    return;
-  }
-
-  OS << "Function: " << F.getName() << "\n";
-
-  // Generate embeddings based on the specified level
-  switch (Level) {
-  case FunctionLevel:
-    Emb->getFunctionVector().print(OS);
-    break;
-  case BasicBlockLevel:
-    for (const BasicBlock &BB : F) {
-      OS << BB.getName() << ":";
-      Emb->getBBVector(BB).print(OS);
-    }
-    break;
-  case InstructionLevel:
-    for (const Instruction &I : instructions(F)) {
-      OS << I;
-      Emb->getInstVector(I).print(OS);
-    }
-    break;
-  }
-}
-
 /// Process the module and generate output based on selected subcommand
 Error processModule(Module &M, raw_ostream &OS) {
   IR2VecTool Tool(M);
@@ -341,215 +180,6 @@ Error processModule(Module &M, raw_ostream &OS) {
 
 namespace mir2vec {
 
-bool MIR2VecTool::initializeVocabulary(const Module &M) {
-  MIR2VecVocabProvider Provider(MMI);
-  auto VocabOrErr = Provider.getVocabulary(M);
-  if (!VocabOrErr) {
-    WithColor::error(errs(), ToolName)
-        << "Failed to load MIR2Vec vocabulary - "
-        << toString(VocabOrErr.takeError()) << "\n";
-    return false;
-  }
-  Vocab = std::make_unique<MIRVocabulary>(std::move(*VocabOrErr));
-  return true;
-}
-
-bool MIR2VecTool::initializeVocabularyForLayout(const Module &M) {
-  for (const Function &F : M.getFunctionDefs()) {
-    MachineFunction *MF = MMI.getMachineFunction(F);
-    if (!MF)
-      continue;
-
-    const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
-    const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
-    const MachineRegisterInfo &MRI = MF->getRegInfo();
-
-    auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(TII, TRI, MRI, 1);
-    if (!VocabOrErr) {
-      WithColor::error(errs(), ToolName)
-          << "Failed to create dummy vocabulary - "
-          << toString(VocabOrErr.takeError()) << "\n";
-      return false;
-    }
-    Vocab = std::make_unique<MIRVocabulary>(std::move(*VocabOrErr));
-    return true;
-  }
-
-  WithColor::error(errs(), ToolName)
-      << "No machine functions found to initialize vocabulary\n";
-  return false;
-}
-
-TripletResult MIR2VecTool::generateTriplets(const MachineFunction &MF) const {
-  TripletResult Result;
-  Result.MaxRelation = MIRNextRelation;
-
-  if (!Vocab) {
-    WithColor::error(errs(), ToolName)
-        << "MIR Vocabulary must be initialized for triplet generation.\n";
-    return Result;
-  }
-
-  unsigned PrevOpcode = 0;
-  bool HasPrevOpcode = false;
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      // Skip debug instructions
-      if (MI.isDebugInstr())
-        continue;
-
-      // Get opcode entity ID
-      unsigned OpcodeID = Vocab->getEntityIDForOpcode(MI.getOpcode());
-
-      // Add "Next" relationship with previous instruction
-      if (HasPrevOpcode) {
-        Result.Triplets.push_back({PrevOpcode, OpcodeID, MIRNextRelation});
-        LLVM_DEBUG(dbgs() << Vocab->getStringKey(PrevOpcode) << '\t'
-                          << Vocab->getStringKey(OpcodeID) << '\t' << "Next\n");
-      }
-
-      // Add "Arg" relationships for operands
-      unsigned ArgIndex = 0;
-      for (const MachineOperand &MO : MI.operands()) {
-        auto OperandID = Vocab->getEntityIDForMachineOperand(MO);
-        unsigned RelationID = MIRArgRelation + ArgIndex;
-        Result.Triplets.push_back({OpcodeID, OperandID, RelationID});
-        LLVM_DEBUG({
-          std::string OperandStr = Vocab->getStringKey(OperandID);
-          dbgs() << Vocab->getStringKey(OpcodeID) << '\t' << OperandStr << '\t'
-                 << "Arg" << ArgIndex << '\n';
-        });
-
-        ++ArgIndex;
-      }
-
-      // Update MaxRelation if there were operands
-      if (ArgIndex > 0)
-        Result.MaxRelation =
-            std::max(Result.MaxRelation, MIRArgRelation + ArgIndex - 1);
-
-      PrevOpcode = OpcodeID;
-      HasPrevOpcode = true;
-    }
-  }
-
-  return Result;
-}
-
-TripletResult MIR2VecTool::generateTriplets(const Module &M) const {
-  TripletResult Result;
-  Result.MaxRelation = MIRNextRelation;
-
-  for (const Function &F : M.getFunctionDefs()) {
-    MachineFunction *MF = MMI.getMachineFunction(F);
-    if (!MF) {
-      WithColor::warning(errs(), ToolName)
-          << "No MachineFunction for " << F.getName() << "\n";
-      continue;
-    }
-
-    TripletResult FuncResult = generateTriplets(*MF);
-    Result.MaxRelation = std::max(Result.MaxRelation, FuncResult.MaxRelation);
-    Result.Triplets.insert(Result.Triplets.end(), FuncResult.Triplets.begin(),
-                           FuncResult.Triplets.end());
-  }
-
-  return Result;
-}
-
-void MIR2VecTool::writeTripletsToStream(const Module &M,
-                                        raw_ostream &OS) const {
-  auto Result = generateTriplets(M);
-  OS << "MAX_RELATION=" << Result.MaxRelation << '\n';
-  for (const auto &T : Result.Triplets)
-    OS << T.Head << '\t' << T.Tail << '\t' << T.Relation << '\n';
-}
-
-EntityList MIR2VecTool::collectEntityMappings() const {
-  if (!Vocab) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary must be initialized for entity mappings.\n";
-    return {};
-  }
-
-  const unsigned EntityCount = Vocab->getCanonicalSize();
-  EntityList Result;
-  for (unsigned EntityID = 0; EntityID < EntityCount; ++EntityID)
-    Result.push_back(Vocab->getStringKey(EntityID));
-
-  return Result;
-}
-
-void MIR2VecTool::writeEntitiesToStream(raw_ostream &OS) const {
-  auto Entities = collectEntityMappings();
-  if (Entities.empty())
-    return;
-
-  OS << Entities.size() << "\n";
-  for (unsigned EntityID = 0; EntityID < Entities.size(); ++EntityID)
-    OS << Entities[EntityID] << '\t' << EntityID << '\n';
-}
-
-void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
-                                          EmbeddingLevel Level) const {
-  if (!Vocab) {
-    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
-    return;
-  }
-
-  for (const Function &F : M.getFunctionDefs()) {
-    MachineFunction *MF = MMI.getMachineFunction(F);
-    if (!MF) {
-      WithColor::warning(errs(), ToolName)
-          << "No MachineFunction for " << F.getName() << "\n";
-      continue;
-    }
-
-    writeEmbeddingsToStream(*MF, OS, Level);
-  }
-}
-
-void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
-                                          EmbeddingLevel Level) const {
-  if (!Vocab) {
-    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
-    return;
-  }
-
-  auto Emb = MIREmbedder::create(MIR2VecKind::Symbolic, MF, *Vocab);
-  if (!Emb) {
-    WithColor::error(errs(), ToolName)
-        << "Failed to create embedder for " << MF.getName() << "\n";
-    return;
-  }
-
-  OS << "MIR2Vec embeddings for machine function " << MF.getName() << ":\n";
-
-  // Generate embeddings based on the specified level
-  switch (Level) {
-  case FunctionLevel:
-    OS << "Function vector: ";
-    Emb->getMFunctionVector().print(OS);
-    break;
-  case BasicBlockLevel:
-    OS << "Basic block vectors:\n";
-    for (const MachineBasicBlock &MBB : MF) {
-      OS << "MBB " << MBB.getName() << ": ";
-      Emb->getMBBVector(MBB).print(OS);
-    }
-    break;
-  case InstructionLevel:
-    OS << "Instruction vectors:\n";
-    for (const MachineBasicBlock &MBB : MF) {
-      for (const MachineInstr &MI : MBB) {
-        OS << MI << " -> ";
-        Emb->getMInstVector(MI).print(OS);
-      }
-    }
-    break;
-  }
-}
-
 /// Setup MIR context from input file
 Error setupMIRContext(const std::string &InputFile, MIRContext &Ctx) {
   SMDiagnostic Err;

>From 8662053c20ace235f374ee60b51a18f07c4ab8fe Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Sun, 21 Dec 2025 22:32:35 +0530
Subject: [PATCH 02/13] nit commit - code formatting fixup

---
 llvm/tools/llvm-ir2vec/emb-tool.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-ir2vec/emb-tool.h b/llvm/tools/llvm-ir2vec/emb-tool.h
index 009bcec60108b..24b8022ed7482 100644
--- a/llvm/tools/llvm-ir2vec/emb-tool.h
+++ b/llvm/tools/llvm-ir2vec/emb-tool.h
@@ -44,7 +44,7 @@
 #define DEBUG_TYPE "ir2vec"
 
 namespace llvm {
-  
+
 /// Tool name for error reporting
 static const char *ToolName = "llvm-ir2vec";
 

>From d7a26ceda8725e1f1f49992ec1f3f38bdf0d464e Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Thu, 1 Jan 2026 21:04:03 +0530
Subject: [PATCH 03/13] renaming emb-tool files to utils files

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt              | 2 +-
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp             | 2 +-
 llvm/tools/llvm-ir2vec/{emb-tool.cpp => utils.cpp} | 2 +-
 llvm/tools/llvm-ir2vec/{emb-tool.h => utils.h}     | 0
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename llvm/tools/llvm-ir2vec/{emb-tool.cpp => utils.cpp} (99%)
 rename llvm/tools/llvm-ir2vec/{emb-tool.h => utils.h} (100%)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 9d5db8663fb38..1c9ab9b7fb173 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -19,7 +19,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_tool(llvm-ir2vec
   llvm-ir2vec.cpp
-  emb-tool.cpp
+  utils.cpp
   
   DEPENDS
   intrinsics_gen
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index a2b2f4e6a7aa8..95eaf0cd7fa32 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -54,7 +54,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "emb-tool.h"
+#include "utils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
 #include "llvm/IR/BasicBlock.h"
diff --git a/llvm/tools/llvm-ir2vec/emb-tool.cpp b/llvm/tools/llvm-ir2vec/utils.cpp
similarity index 99%
rename from llvm/tools/llvm-ir2vec/emb-tool.cpp
rename to llvm/tools/llvm-ir2vec/utils.cpp
index 891b26f8ef763..d938ae4abd236 100644
--- a/llvm/tools/llvm-ir2vec/emb-tool.cpp
+++ b/llvm/tools/llvm-ir2vec/utils.cpp
@@ -12,7 +12,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "emb-tool.h"
+#include "utils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
 #include "llvm/IR/BasicBlock.h"
diff --git a/llvm/tools/llvm-ir2vec/emb-tool.h b/llvm/tools/llvm-ir2vec/utils.h
similarity index 100%
rename from llvm/tools/llvm-ir2vec/emb-tool.h
rename to llvm/tools/llvm-ir2vec/utils.h

>From db94e29fcf8ad724f6acd30e9241b79d2dc71792 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Thu, 1 Jan 2026 22:11:10 +0530
Subject: [PATCH 04/13] Nit commit, formatting fixups

---
 llvm/tools/llvm-ir2vec/utils.cpp | 4 ++--
 llvm/tools/llvm-ir2vec/utils.h   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/utils.cpp b/llvm/tools/llvm-ir2vec/utils.cpp
index d938ae4abd236..51c301bc0dbcb 100644
--- a/llvm/tools/llvm-ir2vec/utils.cpp
+++ b/llvm/tools/llvm-ir2vec/utils.cpp
@@ -1,4 +1,4 @@
-//===- emb-tool.cpp - IR2Vec/MIR2Vec Embedding Generation Tool -----------===//
+//===- utils.cpp - IR2Vec/MIR2Vec Embedding Generation Tool -----------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -418,4 +418,4 @@ void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
 
 } // namespace mir2vec
 
-} // namespace llvm
\ No newline at end of file
+} // namespace llvm
diff --git a/llvm/tools/llvm-ir2vec/utils.h b/llvm/tools/llvm-ir2vec/utils.h
index 24b8022ed7482..1fcddb211259a 100644
--- a/llvm/tools/llvm-ir2vec/utils.h
+++ b/llvm/tools/llvm-ir2vec/utils.h
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
-#define LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
+#ifndef LLVM_TOOLS_LLVM_IR2VEC_UTILS_H
+#define LLVM_TOOLS_LLVM_IR2VEC_UTILS_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
@@ -198,4 +198,4 @@ struct MIRContext {
 
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_IR2VEC_EMB_TOOL_H
\ No newline at end of file
+#endif // LLVM_TOOLS_LLVM_IR2VEC_UTILS_H

>From f2d912272c49cd6414590f4b15997ca2e40af900 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Tue, 6 Jan 2026 20:21:51 +0530
Subject: [PATCH 05/13] Work Commit - Moving utils files to a different folder,
 and building them as a static library. This will help keep the build process
 and code minimal for the upcoming Python bindings

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt        | 46 ++++++++++----------
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp       |  2 +-
 llvm/tools/llvm-ir2vec/{ => utils}/utils.cpp | 43 ++++++++----------
 llvm/tools/llvm-ir2vec/{ => utils}/utils.h   |  6 +--
 4 files changed, 43 insertions(+), 54 deletions(-)
 rename llvm/tools/llvm-ir2vec/{ => utils}/utils.cpp (89%)
 rename llvm/tools/llvm-ir2vec/{ => utils}/utils.h (98%)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 1c9ab9b7fb173..a009881c1c58c 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -1,26 +1,24 @@
 set(LLVM_LINK_COMPONENTS
-  # Core LLVM components for IR processing
-  Analysis
-  Core
-  IRReader
-  Support
-  
-  # Machine IR components (for -mode=mir)
-  CodeGen           
-  MIRParser         
-  
-  # Target initialization (required for MIR parsing)
-  AllTargetsAsmParsers
-  AllTargetsCodeGens
-  AllTargetsDescs
-  AllTargetsInfos
-  TargetParser
-  )
+  Analysis Core Demangle IRReader Support CodeGen MIRParser
+  AllTargetsAsmParsers AllTargetsCodeGens AllTargetsDescs 
+  AllTargetsInfos TargetParser
+)
 
-add_llvm_tool(llvm-ir2vec
-  llvm-ir2vec.cpp
-  utils.cpp
-  
-  DEPENDS
-  intrinsics_gen
-  )
+# Static utility library
+add_library(ir2vec_utils STATIC utils/utils.cpp)
+
+target_include_directories(ir2vec_utils 
+  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/utils
+  PRIVATE ${LLVM_INCLUDE_DIRS}
+)
+
+target_link_libraries(ir2vec_utils PRIVATE
+  LLVMAnalysis LLVMCore LLVMSupport LLVMIRReader 
+  LLVMAsmParser LLVMPasses LLVMDemangle
+)
+
+target_compile_definitions(ir2vec_utils PRIVATE ${LLVM_DEFINITIONS})
+
+# Main executable
+add_llvm_tool(llvm-ir2vec llvm-ir2vec.cpp DEPENDS intrinsics_gen)
+target_link_libraries(llvm-ir2vec PRIVATE ir2vec_utils)
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 95eaf0cd7fa32..165af89058b17 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -399,4 +399,4 @@ int main(int argc, char **argv) {
   }
 
   return 0;
-}
+}
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/utils.cpp b/llvm/tools/llvm-ir2vec/utils/utils.cpp
similarity index 89%
rename from llvm/tools/llvm-ir2vec/utils.cpp
rename to llvm/tools/llvm-ir2vec/utils/utils.cpp
index 51c301bc0dbcb..26ff2e00ee355 100644
--- a/llvm/tools/llvm-ir2vec/utils.cpp
+++ b/llvm/tools/llvm-ir2vec/utils/utils.cpp
@@ -10,7 +10,7 @@
 /// This file implements the IR2VecTool and MIR2VecTool classes for
 /// IR2Vec/MIR2Vec embedding generation.
 ///
-//===----------------------------------------------------------------------===//
+//===------------------------------------------------------------------Fail----===//
 
 #include "utils.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -33,7 +33,6 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/Support/WithColor.h"
 #include "llvm/Target/TargetMachine.h"
 
 #define DEBUG_TYPE "ir2vec"
@@ -152,8 +151,7 @@ void IR2VecTool::writeEntitiesToStream(raw_ostream &OS) {
 void IR2VecTool::writeEmbeddingsToStream(raw_ostream &OS,
                                          EmbeddingLevel Level) const {
   if (!Vocab->isValid()) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
+    errs() << "Error: Vocabulary is not valid. IR2VecTool not initialized.\n";
     return;
   }
 
@@ -164,8 +162,7 @@ void IR2VecTool::writeEmbeddingsToStream(raw_ostream &OS,
 void IR2VecTool::writeEmbeddingsToStream(const Function &F, raw_ostream &OS,
                                          EmbeddingLevel Level) const {
   if (!Vocab || !Vocab->isValid()) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary is not valid. IR2VecTool not initialized.\n";
+    errs() << "Error: Vocabulary is not valid. IR2VecTool not initialized.\n";
     return;
   }
   if (F.isDeclaration()) {
@@ -176,8 +173,8 @@ void IR2VecTool::writeEmbeddingsToStream(const Function &F, raw_ostream &OS,
   // Create embedder for this function
   auto Emb = Embedder::create(IR2VecEmbeddingKind, F, *Vocab);
   if (!Emb) {
-    WithColor::error(errs(), ToolName)
-        << "Failed to create embedder for function " << F.getName() << "\n";
+    errs() << "Error: Failed to create embedder for function " << F.getName()
+           << "\n";
     return;
   }
 
@@ -232,17 +229,15 @@ bool MIR2VecTool::initializeVocabularyForLayout(const Module &M) {
 
     auto VocabOrErr = MIRVocabulary::createDummyVocabForTest(TII, TRI, MRI, 1);
     if (!VocabOrErr) {
-      WithColor::error(errs(), ToolName)
-          << "Failed to create dummy vocabulary - "
-          << toString(VocabOrErr.takeError()) << "\n";
+      errs() << "Error: Failed to create dummy vocabulary - "
+             << toString(VocabOrErr.takeError()) << "\n";
       return false;
     }
     Vocab = std::make_unique<MIRVocabulary>(std::move(*VocabOrErr));
     return true;
   }
 
-  WithColor::error(errs(), ToolName)
-      << "No machine functions found to initialize vocabulary\n";
+  errs() << "Error: No machine functions found to initialize vocabulary\n";
   return false;
 }
 
@@ -251,8 +246,8 @@ TripletResult MIR2VecTool::generateTriplets(const MachineFunction &MF) const {
   Result.MaxRelation = MIRNextRelation;
 
   if (!Vocab) {
-    WithColor::error(errs(), ToolName)
-        << "MIR Vocabulary must be initialized for triplet generation.\n";
+    errs() << "Error: MIR Vocabulary must be initialized for triplet "
+              "generation.\n";
     return Result;
   }
 
@@ -309,8 +304,7 @@ TripletResult MIR2VecTool::generateTriplets(const Module &M) const {
   for (const Function &F : M.getFunctionDefs()) {
     MachineFunction *MF = MMI.getMachineFunction(F);
     if (!MF) {
-      WithColor::warning(errs(), ToolName)
-          << "No MachineFunction for " << F.getName() << "\n";
+      errs() << "Warning: No MachineFunction for " << F.getName() << "\n";
       continue;
     }
 
@@ -333,8 +327,7 @@ void MIR2VecTool::writeTripletsToStream(const Module &M,
 
 EntityList MIR2VecTool::collectEntityMappings() const {
   if (!Vocab) {
-    WithColor::error(errs(), ToolName)
-        << "Vocabulary must be initialized for entity mappings.\n";
+    errs() << "Error: Vocabulary must be initialized for entity mappings.\n";
     return {};
   }
 
@@ -359,15 +352,14 @@ void MIR2VecTool::writeEntitiesToStream(raw_ostream &OS) const {
 void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
                                           EmbeddingLevel Level) const {
   if (!Vocab) {
-    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
+    errs() << "Error: Vocabulary not initialized.\n";
     return;
   }
 
   for (const Function &F : M.getFunctionDefs()) {
     MachineFunction *MF = MMI.getMachineFunction(F);
     if (!MF) {
-      WithColor::warning(errs(), ToolName)
-          << "No MachineFunction for " << F.getName() << "\n";
+      errs() << "Warning: No MachineFunction for " << F.getName() << "\n";
       continue;
     }
 
@@ -378,14 +370,13 @@ void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
 void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
                                           EmbeddingLevel Level) const {
   if (!Vocab) {
-    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
+    errs() << "Error: Vocabulary not initialized.\n";
     return;
   }
 
   auto Emb = MIREmbedder::create(MIR2VecKind::Symbolic, MF, *Vocab);
   if (!Emb) {
-    WithColor::error(errs(), ToolName)
-        << "Failed to create embedder for " << MF.getName() << "\n";
+    errs() << "Error: Failed to create embedder for " << MF.getName() << "\n";
     return;
   }
 
@@ -418,4 +409,4 @@ void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
 
 } // namespace mir2vec
 
-} // namespace llvm
+} // namespace llvm
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/utils.h b/llvm/tools/llvm-ir2vec/utils/utils.h
similarity index 98%
rename from llvm/tools/llvm-ir2vec/utils.h
rename to llvm/tools/llvm-ir2vec/utils/utils.h
index 1fcddb211259a..13ce3e500ee2d 100644
--- a/llvm/tools/llvm-ir2vec/utils.h
+++ b/llvm/tools/llvm-ir2vec/utils/utils.h
@@ -12,8 +12,8 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TOOLS_LLVM_IR2VEC_UTILS_H
-#define LLVM_TOOLS_LLVM_IR2VEC_UTILS_H
+#ifndef LLVM_TOOLS_LLVM_IR2VEC_UTILS_UTILS_H
+#define LLVM_TOOLS_LLVM_IR2VEC_UTILS_UTILS_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Analysis/IR2Vec.h"
@@ -198,4 +198,4 @@ struct MIRContext {
 
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_IR2VEC_UTILS_H
+#endif // LLVM_TOOLS_LLVM_IR2VEC_UTILS_UTILS_H
\ No newline at end of file

>From f384164c9901ff85db48d6b3130e140c7c67704e Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Tue, 6 Jan 2026 20:23:37 +0530
Subject: [PATCH 06/13] Nit commit - formatting commit

---
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 2 +-
 llvm/tools/llvm-ir2vec/utils/utils.cpp | 2 +-
 llvm/tools/llvm-ir2vec/utils/utils.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 165af89058b17..95eaf0cd7fa32 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -399,4 +399,4 @@ int main(int argc, char **argv) {
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/llvm/tools/llvm-ir2vec/utils/utils.cpp b/llvm/tools/llvm-ir2vec/utils/utils.cpp
index 26ff2e00ee355..e7b0f02cb4069 100644
--- a/llvm/tools/llvm-ir2vec/utils/utils.cpp
+++ b/llvm/tools/llvm-ir2vec/utils/utils.cpp
@@ -409,4 +409,4 @@ void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
 
 } // namespace mir2vec
 
-} // namespace llvm
\ No newline at end of file
+} // namespace llvm
diff --git a/llvm/tools/llvm-ir2vec/utils/utils.h b/llvm/tools/llvm-ir2vec/utils/utils.h
index 13ce3e500ee2d..58a4551947536 100644
--- a/llvm/tools/llvm-ir2vec/utils/utils.h
+++ b/llvm/tools/llvm-ir2vec/utils/utils.h
@@ -198,4 +198,4 @@ struct MIRContext {
 
 } // namespace llvm
 
-#endif // LLVM_TOOLS_LLVM_IR2VEC_UTILS_UTILS_H
\ No newline at end of file
+#endif // LLVM_TOOLS_LLVM_IR2VEC_UTILS_UTILS_H

>From 9fd41adb03f075112929a2478a4539ea60a46178 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Tue, 6 Jan 2026 20:27:52 +0530
Subject: [PATCH 07/13] Nit commit - CMakeLists.txt code layout fixup

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index a009881c1c58c..eab03d51d7c6a 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -1,8 +1,21 @@
 set(LLVM_LINK_COMPONENTS
-  Analysis Core Demangle IRReader Support CodeGen MIRParser
-  AllTargetsAsmParsers AllTargetsCodeGens AllTargetsDescs 
-  AllTargetsInfos TargetParser
-)
+  # Core LLVM components for IR processing
+  Analysis
+  Core
+  IRReader
+  Support
+  
+  # Machine IR components (for -mode=mir)
+  CodeGen           
+  MIRParser         
+  
+  # Target initialization (required for MIR parsing)
+  AllTargetsAsmParsers
+  AllTargetsCodeGens
+  AllTargetsDescs
+  AllTargetsInfos
+  TargetParser
+  )
 
 # Static utility library
 add_library(ir2vec_utils STATIC utils/utils.cpp)

>From f59221f3cab53c8ebe0ff3ed7ffc775c6a0bc12a Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Tue, 6 Jan 2026 20:42:32 +0530
Subject: [PATCH 08/13] Nit commit - changing ir2vec_utils to emb_utils

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index eab03d51d7c6a..49c45bd2d345d 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -18,20 +18,26 @@ set(LLVM_LINK_COMPONENTS
   )
 
 # Static utility library
-add_library(ir2vec_utils STATIC utils/utils.cpp)
+add_library(emb_utils STATIC utils/utils.cpp)
 
-target_include_directories(ir2vec_utils 
+target_include_directories(emb_utils 
   PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/utils
   PRIVATE ${LLVM_INCLUDE_DIRS}
 )
 
-target_link_libraries(ir2vec_utils PRIVATE
+target_link_libraries(emb_utils PRIVATE
   LLVMAnalysis LLVMCore LLVMSupport LLVMIRReader 
   LLVMAsmParser LLVMPasses LLVMDemangle
 )
 
-target_compile_definitions(ir2vec_utils PRIVATE ${LLVM_DEFINITIONS})
+target_compile_definitions(emb_utils PRIVATE ${LLVM_DEFINITIONS})
 
 # Main executable
-add_llvm_tool(llvm-ir2vec llvm-ir2vec.cpp DEPENDS intrinsics_gen)
-target_link_libraries(llvm-ir2vec PRIVATE ir2vec_utils)
\ No newline at end of file
+add_llvm_tool(llvm-ir2vec
+  llvm-ir2vec.cpp
+  
+  DEPENDS
+  intrinsics_gen
+  )
+
+target_link_libraries(llvm-ir2vec PRIVATE emb_utils)

>From 4611ae61b2a9c5edc2220d93fb0d42184b3d7ad9 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Wed, 7 Jan 2026 01:11:44 +0530
Subject: [PATCH 09/13] Fixup commit - fix warning during llvm-ir2vec build that
 caused a failed CI job

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 49c45bd2d345d..3eccffd350d5e 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -30,8 +30,6 @@ target_link_libraries(emb_utils PRIVATE
   LLVMAsmParser LLVMPasses LLVMDemangle
 )
 
-target_compile_definitions(emb_utils PRIVATE ${LLVM_DEFINITIONS})
-
 # Main executable
 add_llvm_tool(llvm-ir2vec
   llvm-ir2vec.cpp

>From 3d7b756309a0a011e16c49e294d63749bdf4c612 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Wed, 7 Jan 2026 14:27:02 +0530
Subject: [PATCH 10/13] Nit commit - stray typo in utils.cpp file

---
 llvm/tools/llvm-ir2vec/utils/utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/tools/llvm-ir2vec/utils/utils.cpp b/llvm/tools/llvm-ir2vec/utils/utils.cpp
index e7b0f02cb4069..c299b8d490cb4 100644
--- a/llvm/tools/llvm-ir2vec/utils/utils.cpp
+++ b/llvm/tools/llvm-ir2vec/utils/utils.cpp
@@ -10,7 +10,7 @@
 /// This file implements the IR2VecTool and MIR2VecTool classes for
 /// IR2Vec/MIR2Vec embedding generation.
 ///
-//===------------------------------------------------------------------Fail----===//
+//===----------------------------------------------------------------------===//
 
 #include "utils.h"
 #include "llvm/ADT/ArrayRef.h"

>From f6a0dce6621afcd543859240d44871f07e033e70 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Fri, 9 Jan 2026 20:32:27 +0530
Subject: [PATCH 11/13] Building llvm-ir2vec utils lib from its own directory

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt       | 19 +++++--------------
 llvm/tools/llvm-ir2vec/utils/CMakeLists.txt |  8 ++++++++
 2 files changed, 13 insertions(+), 14 deletions(-)
 create mode 100644 llvm/tools/llvm-ir2vec/utils/CMakeLists.txt

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 3eccffd350d5e..55a11a7f66927 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -17,18 +17,8 @@ set(LLVM_LINK_COMPONENTS
   TargetParser
   )
 
-# Static utility library
-add_library(emb_utils STATIC utils/utils.cpp)
-
-target_include_directories(emb_utils 
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/utils
-  PRIVATE ${LLVM_INCLUDE_DIRS}
-)
-
-target_link_libraries(emb_utils PRIVATE
-  LLVMAnalysis LLVMCore LLVMSupport LLVMIRReader 
-  LLVMAsmParser LLVMPasses LLVMDemangle
-)
+# Add the utils subdirectory for the library
+add_subdirectory(utils)
 
 # Main executable
 add_llvm_tool(llvm-ir2vec
@@ -36,6 +26,7 @@ add_llvm_tool(llvm-ir2vec
   
   DEPENDS
   intrinsics_gen
-  )
+)
 
-target_link_libraries(llvm-ir2vec PRIVATE emb_utils)
+target_include_directories(llvm-ir2vec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/utils)
+target_link_libraries(llvm-ir2vec PRIVATE emb_utils)
\ No newline at end of file
diff --git a/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt b/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..3b300ce8e6ff8
--- /dev/null
+++ b/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_llvm_library(emb_utils STATIC
+  utils.cpp
+)
+
+target_link_libraries(emb_utils PRIVATE
+  LLVMAnalysis LLVMCore LLVMSupport LLVMIRReader 
+  LLVMAsmParser LLVMPasses LLVMDemangle
+)
\ No newline at end of file

>From 225bb0a644d8f527bb0cd5f2fa97f72219d27e51 Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Fri, 9 Jan 2026 20:35:08 +0530
Subject: [PATCH 12/13] Work Commit - restoring WithColor usage for error
 messages for writeEmbeddingsToStream methods

---
 llvm/tools/llvm-ir2vec/utils/utils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/utils/utils.cpp b/llvm/tools/llvm-ir2vec/utils/utils.cpp
index c299b8d490cb4..9539ff3d09da1 100644
--- a/llvm/tools/llvm-ir2vec/utils/utils.cpp
+++ b/llvm/tools/llvm-ir2vec/utils/utils.cpp
@@ -352,7 +352,7 @@ void MIR2VecTool::writeEntitiesToStream(raw_ostream &OS) const {
 void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
                                           EmbeddingLevel Level) const {
   if (!Vocab) {
-    errs() << "Error: Vocabulary not initialized.\n";
+    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
     return;
   }
 
@@ -370,7 +370,7 @@ void MIR2VecTool::writeEmbeddingsToStream(const Module &M, raw_ostream &OS,
 void MIR2VecTool::writeEmbeddingsToStream(MachineFunction &MF, raw_ostream &OS,
                                           EmbeddingLevel Level) const {
   if (!Vocab) {
-    errs() << "Error: Vocabulary not initialized.\n";
+    WithColor::error(errs(), ToolName) << "Vocabulary not initialized.\n";
     return;
   }
 

>From fa07f40298186b77bc47a732aa97367f8be8683f Mon Sep 17 00:00:00 2001
From: nishant-sachdeva <nishant.sachdeva at research.iiit.ac.in>
Date: Sun, 11 Jan 2026 16:48:03 +0530
Subject: [PATCH 13/13] Nit commit - changes to linking code and formatting
 edits

---
 llvm/tools/llvm-ir2vec/CMakeLists.txt       |  2 +-
 llvm/tools/llvm-ir2vec/utils/CMakeLists.txt | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt
index 55a11a7f66927..dfee676d8a148 100644
--- a/llvm/tools/llvm-ir2vec/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt
@@ -29,4 +29,4 @@ add_llvm_tool(llvm-ir2vec
 )
 
 target_include_directories(llvm-ir2vec PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/utils)
-target_link_libraries(llvm-ir2vec PRIVATE emb_utils)
\ No newline at end of file
+target_link_libraries(llvm-ir2vec PRIVATE emb_utils)
diff --git a/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt b/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt
index 3b300ce8e6ff8..1d51fb3cd15fc 100644
--- a/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt
+++ b/llvm/tools/llvm-ir2vec/utils/CMakeLists.txt
@@ -1,8 +1,12 @@
 add_llvm_library(emb_utils STATIC
   utils.cpp
+  
+  LINK_COMPONENTS
+  Analysis
+  Core
+  Support
+  IRReader
+  AsmParser
+  Passes
+  Demangle
 )
-
-target_link_libraries(emb_utils PRIVATE
-  LLVMAnalysis LLVMCore LLVMSupport LLVMIRReader 
-  LLVMAsmParser LLVMPasses LLVMDemangle
-)
\ No newline at end of file



More information about the llvm-commits mailing list