[llvm-branch-commits] [llvm] [IR2Vec] Add triplet generation utility script for vocabulary training (PR #149215)
Aiden Grossman via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jul 22 08:47:10 PDT 2025
================
@@ -0,0 +1,291 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""IR2Vec Triplet Generator
+
+Generates IR2Vec triplets by applying random optimization levels to LLVM IR files
+and extracting triplets using llvm-ir2vec. Automatically generates preprocessed
+files: entity2id.txt, relation2id.txt, and train2id.txt.
+
+Usage:
+ python generateTriplets.py <llvm_build_dir> <num_optimizations> <ll_file_list> <output_dir>
+"""
+
+import argparse
+import logging
+import os
+import random
+import subprocess
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import List, Set, Tuple
+
+# Configuration
+OPT_LEVELS = ["O0", "O1", "O2", "O3", "Os", "Oz"]
+DEFAULT_MAX_WORKERS = 100
+
+logger = logging.getLogger(__name__)
+
+
+class TripletResult:
+ """Result from processing a single LLVM IR file"""
+
+ __slots__ = ["triplets", "max_relation"]
+
+ def __init__(self, triplets: Set[str], max_relation: int):
+ self.triplets = triplets
+ self.max_relation = max_relation
+
+
+class IR2VecTripletGenerator:
+ """Main class for generating IR2Vec triplets"""
+
+ def __init__(
+ self,
+ llvm_build_dir: Path,
+ num_optimizations: int,
+ output_dir: Path,
+ max_workers: int = DEFAULT_MAX_WORKERS,
+ ):
+ self.llvm_build_dir = llvm_build_dir
+ self.num_optimizations = num_optimizations
+ self.output_dir = output_dir
+ self.max_workers = max_workers
+
+ # Tool paths
+ self.opt_binary = os.path.join(llvm_build_dir, "bin", "opt")
+ self.ir2vec_binary = os.path.join(llvm_build_dir, "bin", "llvm-ir2vec")
+
+ self._validate_setup()
+
+ def _validate_setup(self):
+ """Validate that all required tools and paths exist"""
+ if not self.llvm_build_dir.exists():
+ raise FileNotFoundError(
+ f"LLVM build directory not found: {self.llvm_build_dir}"
+ )
+
+ if not os.path.isfile(self.opt_binary) or not os.access(
+ self.opt_binary, os.X_OK
+ ):
+ raise FileNotFoundError(
+ f"opt binary not found or not executable: {self.opt_binary}"
+ )
+
+ if not os.path.isfile(self.ir2vec_binary) or not os.access(
+ self.ir2vec_binary, os.X_OK
+ ):
+ raise FileNotFoundError(
+ f"llvm-ir2vec binary not found or not executable: {self.ir2vec_binary}"
+ )
+
+ if not (1 <= self.num_optimizations <= len(OPT_LEVELS)):
+ raise ValueError(
+ f"Number of optimizations must be between 1-{len(OPT_LEVELS)}"
+ )
+
+ self.output_dir.mkdir(parents=True, exist_ok=True)
+
+ def _select_optimization_levels(self) -> List[str]:
+ """Select unique random optimization levels"""
+ return random.sample(OPT_LEVELS, self.num_optimizations)
+
+ def _process_single_file(self, input_file: Path) -> TripletResult:
+ """Process a single LLVM IR file with multiple optimization levels"""
+ all_triplets = set()
+ max_relation = 1
+ opt_levels = self._select_optimization_levels()
+
+ for opt_level in opt_levels:
+ try:
+ triplets, file_max_relation = self._run_pipeline(input_file, opt_level)
+ if triplets:
+ all_triplets.update(triplets)
+ max_relation = max(max_relation, file_max_relation)
+ logger.debug(
+ f"Generated {len(triplets)} triplets for {input_file} with {opt_level}"
+ )
+ except Exception as e:
----------------
boomanaiden154 wrote:
This code probably shouldn't be in a `try`/`except` block at all, given that you're already catching `CalledProcessError` inside `_run_pipeline`.
https://github.com/llvm/llvm-project/pull/149215
More information about the llvm-branch-commits mailing list