[llvm] Adding IR2Vec as an analysis pass (PR #134004)
S. VenkataKeerthy via llvm-commits
llvm-commits at lists.llvm.org
Tue May 13 14:14:56 PDT 2025
================
@@ -0,0 +1,435 @@
+//===- IR2VecAnalysis.cpp - IR2Vec Analysis Implementation ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions. See the LICENSE file for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the IR2Vec algorithm.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IR2VecAnalysis.h"
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace ir2vec;
+
+#define DEBUG_TYPE "ir2vec"
+
+STATISTIC(DataMissCounter, "Number of data misses in the vocabulary");
+
+/// Kind of embeddings to generate. IR2Vec defines two kinds: Symbolic and
+/// Flow-aware. Symbolic embeddings capture the "syntactic" and "statistical
+/// correlation" of the IR entities. Flow-aware embeddings build on top of
+/// symbolic embeddings and additionally capture the flow information in the
+/// IR. The enumerators are the values accepted by the -ir2vec-mode option.
+// FIXME: Only Symbolic is currently supported. Add support for Flow-aware in
+// upcoming patches.
+enum class IR2VecKind { Symbolic, FlowAware };
+
+static cl::OptionCategory IR2VecAnalysisCategory("IR2Vec Analysis Options");
+
+/// Command-line option selecting the kind of embeddings to generate
+/// ("symbolic" or "flow-aware"); defaults to symbolic.
+// Declared static, like the other options in this file, so that the option
+// object stays file-local instead of getting external linkage.
+static cl::opt<IR2VecKind>
+    IR2VecMode("ir2vec-mode",
+               cl::desc("Choose type of embeddings to generate:"),
+               cl::values(clEnumValN(IR2VecKind::Symbolic, "symbolic",
+                                     "Generates symbolic embeddings"),
+                          clEnumValN(IR2VecKind::FlowAware, "flow-aware",
+                                     "Generates flow-aware embeddings")),
+               cl::init(IR2VecKind::Symbolic), cl::cat(IR2VecAnalysisCategory));
+
+// FIXME: Fall back to a default vocabulary when no path is specified.
+static cl::opt<std::string>
+ VocabFile("ir2vec-vocab-path", cl::Optional,
+ cl::desc("Path to the vocabulary file for IR2Vec"), cl::init(""),
+ cl::cat(IR2VecAnalysisCategory));
+
+AnalysisKey IR2VecVocabAnalysis::Key;
+AnalysisKey IR2VecAnalysis::Key;
+
+//===----------------------------------------------------------------------===//
+// Embeddings and its subclasses
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// Embeddings provides the interface to generate vector representations for
+/// instructions, basic blocks, and functions. The vector
+/// representations are generated using IR2Vec algorithms.
+///
+/// The Embeddings class is an abstract class and it is intended to be
+/// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware.
+class Embeddings {
+protected:
+ const Function &F;
+ Vocab Vocabulary;
+
+ /// Weights for different entities (like opcode, arguments, types)
+ /// in the IR instructions to generate the vector representation.
+ // FIXME: Defaults to the values used in the original algorithm. Can be
+ // parameterized later.
+ const float OpcWeight = 1.0, TypeWeight = 0.5, ArgWeight = 0.2;
+
+ /// Dimension of the vector representation; captured from the input vocabulary
+ const unsigned Dimension = 300;
+
+ // Utility maps - these are used to store the vector representations of
+ // instructions, basic blocks and functions.
+ Embedding FuncVector;
+ SmallMapVector<const BasicBlock *, Embedding, 16> BBVecMap;
+ SmallMapVector<const Instruction *, Embedding, 128> InstVecMap;
+
+ Embeddings(const Function &F, const Vocab &Vocabulary, unsigned Dimension)
+ : F(F), Vocabulary(Vocabulary), Dimension(Dimension) {}
+
+ /// Lookup vocabulary for a given Key. If the key is not found, it returns a
+ /// zero vector.
+ Embedding lookupVocab(const std::string &Key);
+
+public:
+ virtual ~Embeddings() = default;
+
+ /// Top level function to compute embeddings. Given a function, it
+ /// generates embeddings for all the instructions and basic blocks in that
+ /// function. Logic of computing the embeddings is specific to the kind of
+ /// embeddings being computed.
+ virtual void computeEmbeddings() = 0;
+
+ /// Returns a map containing instructions and the corresponding vector
+ /// representations for the function F, as computed by the IR2Vec
+ /// algorithm.
+ const SmallMapVector<const Instruction *, Embedding, 128> &
+ getInstVecMap() const {
+ return InstVecMap;
+ }
+
+ /// Returns a map containing basic blocks and the corresponding vector
+ /// representations for the function F, as computed by the IR2Vec
+ /// algorithm.
----------------
svkeerthy wrote:
The Embeddings class and its methods are not exposed (it has not been necessary so far), so I added these comments here. Let me know if you'd like me to move them to the header.
https://github.com/llvm/llvm-project/pull/134004
More information about the llvm-commits
mailing list