[llvm] [NFC][llvm-ir2vec] llvm_ir2vec.cpp breakup to extract a reusable header for IR2VecTool, and MIR2VecTool classes (PR #172304)

S. VenkataKeerthy via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 16 23:25:22 PST 2025


================
@@ -0,0 +1,534 @@
+//===- llvm-ir2vec.h - IR2Vec/MIR2Vec Tool Classes ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the IR2VecTool and MIR2VecTool class definitions and
+/// implementations for the llvm-ir2vec embedding generation tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_IR2VEC_LLVM_MIR2VEC_H
+#define LLVM_TOOLS_LLVM_IR2VEC_LLVM_MIR2VEC_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/CodeGen/MIR2Vec.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassInstrumentation.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "ir2vec"
+
+namespace llvm {
+
+/// Tool name for error reporting; `static`, so each including file gets a copy
+static const char *ToolName = "llvm-ir2vec";
+
+/// Specifies the granularity at which embeddings are generated.
+enum EmbeddingLevel {
+  InstructionLevel, ///< Generate instruction-level embeddings
+  BasicBlockLevel,  ///< Generate basic block-level embeddings
+  FunctionLevel     ///< Generate function-level embeddings
+};
+
+/// A single knowledge graph triplet (Head, Relation, Tail). Head and Tail are
+/// indices into an EntityList. Field order is Head, Tail, Relation.
+struct Triplet {
+  unsigned Head = 0;     ///< Index of the head entity in the entity list
+  unsigned Tail = 0;     ///< Index of the tail entity in the entity list
+  unsigned Relation = 0; ///< Relation type (see ir2vec::RelationType)
+};
+
+/// Result of triplet generation: the triplets plus the highest relation index
+struct TripletResult {
+  unsigned MaxRelation =
+      0; ///< Highest relation index used (for ArgRelation + N)
+  std::vector<Triplet> Triplets; ///< Collection of all generated triplets
+};
+
+/// List of entity names; Triplet::Head and Triplet::Tail are indices into it
+using EntityList = std::vector<std::string>;
+
+namespace ir2vec {
+
+/// Relation types for triplet generation (the values of Triplet::Relation)
+enum RelationType {
+  TypeRelation = 0, ///< Instruction to type relationship
+  NextRelation = 1, ///< Sequential instruction relationship
+  ArgRelation = 2   ///< Instruction to operand relationship; base of ArgRelation + N
+};
+
+/// Helper class for collecting IR triplets and generating embeddings
+class IR2VecTool {
+private:
+  Module &M;
+  ModuleAnalysisManager MAM;
+  const Vocabulary *Vocab = nullptr;
+
+public:
+  explicit IR2VecTool(Module &M) : M(M) {}
+
+  /// Initialize the IR2Vec vocabulary analysis
+  bool initializeVocabulary() {
+    // Register and run the IR2Vec vocabulary analysis
+    // The vocabulary file path is specified via --ir2vec-vocab-path global
+    // option
+    MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+    MAM.registerPass([&] { return IR2VecVocabAnalysis(); });
+    // This will throw an error if vocab is not found or invalid
+    Vocab = &MAM.getResult<IR2VecVocabAnalysis>(M);
+    return Vocab->isValid();
+  }
+
+  /// Generate triplets for a single function
+  /// Returns a TripletResult with:
+  ///   - Triplets: vector of all (subject, object, relation) tuples
+  ///   - MaxRelation: highest Arg relation ID used, or NextRelation if none
+  TripletResult generateTriplets(const Function &F) const {
----------------
svkeerthy wrote:

(Here and in other places) In general, the implementations of not-so-small functions can go in the source file rather than in the header.

https://github.com/llvm/llvm-project/pull/172304


More information about the llvm-commits mailing list