[llvm] [NFC][llvm-ir2vec] llvm_ir2vec.cpp breakup to extract a reusable header for IR2VecTool, and MIR2VecTool classes (PR #172304)
S. VenkataKeerthy via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 16 23:25:22 PST 2025
================
@@ -0,0 +1,534 @@
+//===- llvm-ir2vec.h - IR2Vec/MIR2Vec Tool Classes ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the IR2VecTool and MIR2VecTool class definitions and
+/// implementations for the llvm-ir2vec embedding generation tool.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_IR2VEC_LLVM_MIR2VEC_H
+#define LLVM_TOOLS_LLVM_IR2VEC_LLVM_MIR2VEC_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/IR2Vec.h"
+#include "llvm/CodeGen/MIR2Vec.h"
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassInstrumentation.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "ir2vec"
+
+namespace llvm {
+
+/// Tool name for error reporting. NOTE(review): `static` in a header gives each including TU its own copy - confirm this is intended.
+static const char *ToolName = "llvm-ir2vec";
+
+/// Granularity at which embeddings are generated (unscoped enum in namespace llvm).
+enum EmbeddingLevel {
+ InstructionLevel, ///< Generate instruction-level embeddings
+ BasicBlockLevel, ///< Generate basic block-level embeddings
+ FunctionLevel ///< Generate function-level embeddings
+};
+
+/// A single knowledge graph triplet (Head, Relation, Tail). Head and Tail are indices
+/// into an EntityList. Note: the members are declared in Head, Tail, Relation order.
+struct Triplet {
+ unsigned Head = 0; ///< Index of the head entity in the entity list
+ unsigned Tail = 0; ///< Index of the tail entity in the entity list
+ unsigned Relation = 0; ///< Relation type (see ir2vec::RelationType; may be ArgRelation + N)
+};
+
+/// Result of triplet generation: the generated triplets plus the highest relation index
+struct TripletResult {
+ unsigned MaxRelation =
+ 0; ///< Highest relation index used (for ArgRelation + N); initialized to 0
+ std::vector<Triplet> Triplets; ///< Collection of all generated triplets
+};
+
+/// List of entity names: [entity_name]. Triplet::Head and Triplet::Tail are documented as indices into such a list.
+using EntityList = std::vector<std::string>;
+
+namespace ir2vec {
+
+/// Relation types for triplet generation (the values documented for Triplet::Relation)
+enum RelationType {
+ TypeRelation = 0, ///< Instruction to type relationship
+ NextRelation = 1, ///< Sequential instruction relationship
+ ArgRelation = 2 ///< Instruction to operand relationship; base value, used as ArgRelation + N
+};
+
+/// Helper class for collecting IR triplets and generating embeddings
+class IR2VecTool {
+private:
+ Module &M;
+ ModuleAnalysisManager MAM;
+ const Vocabulary *Vocab = nullptr;
+
+public:
+ explicit IR2VecTool(Module &M) : M(M) {} // Stores a reference to M; Vocab stays null until initializeVocabulary()
+
+ /// Initialize the IR2Vec vocabulary analysis; returns Vocab->isValid() for the result
+ bool initializeVocabulary() {
+ // Register the analyses on MAM, then run IR2VecVocabAnalysis over M.
+ // The vocabulary file path is specified via --ir2vec-vocab-path global
+ // option
+ MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+ MAM.registerPass([&] { return IR2VecVocabAnalysis(); });
+ // NOTE(review): presumably reports an error if vocab is not found or invalid - confirm against IR2VecVocabAnalysis
+ Vocab = &MAM.getResult<IR2VecVocabAnalysis>(M);
+ return Vocab->isValid();
+ }
+
+ /// Generate triplets for a single function
+ /// Returns a TripletResult with:
+ /// - Triplets: vector of all (Head, Tail, Relation) tuples
+ /// - MaxRelation: highest Arg relation ID used, or NextRelation if none
+ TripletResult generateTriplets(const Function &F) const {
----------------
svkeerthy wrote:
(Here and in other places) In general, we can move the implementations of not-so-small functions into the source file.
https://github.com/llvm/llvm-project/pull/172304
More information about the llvm-commits
mailing list