[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)
Aiden Grossman via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jul 22 08:47:10 PDT 2025
================
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+ python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+ """Result from processing a single LLVM IR file"""
+
+ __slots__ = ["triplets", "max_relation"]
+
+ def __init__(self, triplets: Set[str], max_relation: int):
+ self.triplets = triplets
+ self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+ """Main class for generating IR2Vec triplets"""
+
+ def __init__(
+ self,
+ llvm_build_dir: Path,
+ num_optimizations: int,
+ output_dir: Path,
+ max_workers: int = DEFAULT_MAX_WORKERS,
+ ):
+ self.llvm_build_dir = llvm_build_dir
+ self.num_optimizations = num_optimizations
+ self.output_dir = output_dir
+ self.max_workers = max_workers
+
+ # Tool paths
+ self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+ self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+ self._validate_setup()
+
+ def _validate_setup(self):
+ """Validate that all required tools and paths exist"""
+ if not self.llvm_build_dir.exists():
+ raise FileNotFoundError(
+ f"LLVM build directory not found: {self.llvm_build_dir}"
+ )
+
+ if not os.path.isfile(self.opt_binary) or not os.access(
+ self.opt_binary, os.X_OK
+ ):
+ raise FileNotFoundError(
+ f"opt binary not found or not executable: {self.opt_binary}"
+ )
+
+ if not os.path.isfile(self.ir2vec_binary) or not os.access(
+ self.ir2vec_binary, os.X_OK
+ ):
+ raise FileNotFoundError(
+ f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+ )
+
+ if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+ raise ValueError(
+ f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+ )
+
+ self.output_dir.mkdir(parents=True, exist_ok=True)
----------------
boomanaiden154 wrote:
This `mkdir` call should probably live somewhere outside of `_validate_setup`. It is slightly odd to have it in here, although I can see the motivation (in a way, it validates that the output path exists).
https://github.com/llvm/llvm-project/pull/149215
More information about the llvm-branch-commits mailing list