[llvm-branch-commits] [llvm] [IR2Vec][llvm-ir2vec] Supporting flow-aware embeddings (PR #153087)
    S. VenkataKeerthy via llvm-branch-commits 
    llvm-branch-commits at lists.llvm.org
       
    Wed Aug 27 13:06:22 PDT 2025
    
    
  
https://github.com/svkeerthy updated https://github.com/llvm/llvm-project/pull/153087
>From 630f4afd73c33d6b375ac787d587497d8d8e44d8 Mon Sep 17 00:00:00 2001
From: svkeerthy <venkatakeerthy at google.com>
Date: Mon, 11 Aug 2025 21:04:37 +0000
Subject: [PATCH] [IR2Vec][llvm-ir2vec] Supporting flow-aware embeddings
---
 llvm/docs/CommandGuide/llvm-ir2vec.rst        | 16 +++-
 .../tools/llvm-ir2vec/embeddings-flowaware.ll | 73 +++++++++++++++++++
 .../{embeddings.ll => embeddings-symbolic.ll} |  0
 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp        | 10 ++-
 4 files changed, 93 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
 rename llvm/test/tools/llvm-ir2vec/{embeddings.ll => embeddings-symbolic.ll} (100%)
diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst b/llvm/docs/CommandGuide/llvm-ir2vec.rst
index 0c9fb6e94b6f3..fc590a6180316 100644
--- a/llvm/docs/CommandGuide/llvm-ir2vec.rst
+++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst
@@ -13,7 +13,9 @@ DESCRIPTION
 
 :program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It
 generates IR2Vec embeddings for LLVM IR and supports triplet generation 
-for vocabulary training. The tool provides three main subcommands:
+for vocabulary training. 
+
+The tool provides three main subcommands:
 
 1. **triplets**: Generates numeric triplets in train2id format for vocabulary
    training from LLVM IR.
@@ -93,7 +95,7 @@ Example Usage:
 
 .. code-block:: bash
 
-   llvm-ir2vec embeddings --ir2vec-vocab-path=vocab.json --level=func input.bc -o embeddings.txt
+   llvm-ir2vec embeddings --ir2vec-vocab-path=vocab.json --ir2vec-kind=symbolic --level=func input.bc -o embeddings.txt
 
 OPTIONS
 -------
@@ -129,6 +131,16 @@ Subcommand-specific options:
 
    Process only the specified function instead of all functions in the module.
 
+.. option:: --ir2vec-kind=<kind>
+
+   Specify the kind of IR2Vec embeddings to generate. Valid values are:
+
+   * ``symbolic`` - Generate symbolic embeddings (default)
+   * ``flow-aware`` - Generate flow-aware embeddings
+
+   Flow-aware embeddings consider control flow relationships between instructions,
+   while symbolic embeddings focus on the symbolic representation of instructions.
+
 .. option:: --ir2vec-vocab-path=<path>
 
    Specify the path to the vocabulary file (required for embedding generation).
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
new file mode 100644
index 0000000000000..b2362f83caf4f
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/embeddings-flowaware.ll
@@ -0,0 +1,73 @@
+; RUN: llvm-ir2vec embeddings --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT
+; RUN: llvm-ir2vec embeddings --level=func --ir2vec-kind=flow-aware  --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL
+; RUN: llvm-ir2vec embeddings --level=func --function=abc --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC
+; RUN: not llvm-ir2vec embeddings --level=func --function=def --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF
+; RUN: llvm-ir2vec embeddings --level=bb --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL
+; RUN: llvm-ir2vec embeddings --level=bb --function=abc_repeat --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT
+; RUN: llvm-ir2vec embeddings --level=inst --function=abc_repeat --ir2vec-kind=flow-aware --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT
+
+define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) #0 {
+entry:
+  %a.addr = alloca i32, align 4
+  %b.addr = alloca float, align 4
+  store i32 %a, ptr %a.addr, align 4
+  store float %b, ptr %b.addr, align 4
+  %0 = load i32, ptr %a.addr, align 4
+  %1 = load i32, ptr %a.addr, align 4
+  %mul = mul nsw i32 %0, %1
+  %conv = sitofp i32 %mul to float
+  %2 = load float, ptr %b.addr, align 4
+  %add = fadd float %conv, %2
+  ret float %add
+}
+
+; CHECK-DEFAULT: Function: abc
+; CHECK-DEFAULT-NEXT: [ 3630.00  3672.00  3714.00 ]
+; CHECK-DEFAULT-NEXT: Function: abc_repeat
+; CHECK-DEFAULT-NEXT: [ 3630.00  3672.00  3714.00 ]
+
+; CHECK-FUNC-LEVEL: Function: abc 
+; CHECK-FUNC-LEVEL-NEXT: [ 3630.00  3672.00  3714.00 ]
+; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat 
+; CHECK-FUNC-LEVEL-NEXT: [ 3630.00  3672.00  3714.00 ]
+
+; CHECK-FUNC-LEVEL-ABC: Function: abc
+; CHECK-FUNC-LEVEL-ABC-NEXT:  [ 3630.00  3672.00  3714.00 ]
+
+; CHECK-FUNC-DEF: Error: Function 'def' not found
+
+; CHECK-BB-LEVEL: Function: abc
+; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00  3672.00  3714.00 ]
+; CHECK-BB-LEVEL-NEXT: Function: abc_repeat
+; CHECK-BB-LEVEL-NEXT: entry: [ 3630.00  3672.00  3714.00 ]
+
+; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 3630.00  3672.00  3714.00 ]
+
+; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00  92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00  92.00  93.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 188.00  190.00  192.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4 [ 188.00  190.00  192.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %0 = load i32, ptr %a.addr, align 4 [ 185.00  187.00  189.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %1 = load i32, ptr %a.addr, align 4 [ 185.00  187.00  189.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %mul = mul nsw i32 %0, %1 [ 419.00  424.00  429.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %conv = sitofp i32 %mul to float [ 549.00  555.00  561.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %2 = load float, ptr %b.addr, align 4 [ 185.00  187.00  189.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %add = fadd float %conv, %2 [ 774.00  783.00  792.00 ]
+; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: ret float %add [ 775.00  785.00  795.00 ]
diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll b/llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
similarity index 100%
rename from llvm/test/tools/llvm-ir2vec/embeddings.ll
rename to llvm/test/tools/llvm-ir2vec/embeddings-symbolic.ll
diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
index 8e17a4a3ab53d..8f8b4e2f2bda8 100644
--- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
+++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp
@@ -25,9 +25,11 @@
 /// 3. Embedding Generation (embeddings):
 ///    Generates IR2Vec embeddings using a trained vocabulary.
 ///    Usage: llvm-ir2vec embeddings --ir2vec-vocab-path=vocab.json
-///    --level=func input.bc -o embeddings.txt Levels: --level=inst
-///    (instructions), --level=bb (basic blocks), --level=func (functions)
-///    (See IR2Vec.cpp for more embedding generation options)
+///    --ir2vec-kind=<kind> --level=<level> input.bc -o embeddings.txt
+///    Kind: --ir2vec-kind=symbolic (default), --ir2vec-kind=flow-aware
+///    Levels: --level=inst (instructions), --level=bb (basic blocks),
+///    --level=func (functions) (See IR2Vec.cpp for more embedding generation
+///    options)
 ///
 //===----------------------------------------------------------------------===//
 
@@ -243,7 +245,7 @@ class IR2VecTool {
 
     // Create embedder for this function
     assert(Vocab->isValid() && "Vocabulary is not valid");
-    auto Emb = Embedder::create(IR2VecKind::Symbolic, F, *Vocab);
+    auto Emb = Embedder::create(IR2VecEmbeddingKind, F, *Vocab);
     if (!Emb) {
       OS << "Error: Failed to create embedder for function " << F.getName()
          << "\n";
    
    
More information about the llvm-branch-commits
mailing list