[llvm] [IR2Vec] Initial infrastructure for MIR2Vec (PR #161463)

Mircea Trofin via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 1 17:51:57 PDT 2025


================
@@ -0,0 +1,201 @@
+//===- MIR2VecTest.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIR2Vec.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace mir2vec;
+using VocabMap = std::map<std::string, ir2vec::Embedding>;
+
+namespace {
+
+// Checks the opcode-name -> base-name mapping performed by
+// MIRVocabulary::extractBaseOpcodeName on a set of representative names.
+TEST(MIR2VecTest, RegexExtraction) {
+  // Shorthand so every expectation reads as "input name -> base name".
+  auto BaseOf = [](const char *OpcodeName) {
+    return MIRVocabulary::extractBaseOpcodeName(OpcodeName);
+  };
+
+  // Names with nothing to strip are expected back unchanged.
+  EXPECT_EQ(BaseOf("NOP"), "NOP");
+  EXPECT_EQ(BaseOf("RET"), "RET");
+
+  // A numeric width followed by an operand-form suffix is expected to be
+  // dropped.
+  EXPECT_EQ(BaseOf("ADD16ri"), "ADD");
+  EXPECT_EQ(BaseOf("ADD32rr"), "ADD");
+  EXPECT_EQ(BaseOf("ADD64rm"), "ADD");
+  EXPECT_EQ(BaseOf("MOV8ri"), "MOV");
+  EXPECT_EQ(BaseOf("MOV32mr"), "MOV");
+  EXPECT_EQ(BaseOf("PUSH64r"), "PUSH");
+  EXPECT_EQ(BaseOf("POP64r"), "POP");
+  EXPECT_EQ(BaseOf("CALL64pcrel32"), "CALL");
+
+  // A trailing "_<digits>" is expected to be dropped, while underscores
+  // inside the name are kept.
+  EXPECT_EQ(BaseOf("JMP_4"), "JMP");
+  EXPECT_EQ(BaseOf("SOME_INSTR_123"), "SOME_INSTR");
+
+  // Leading digits are expected to be dropped as well.
+  EXPECT_EQ(BaseOf("123ADD"), "ADD");
+
+  // An all-digit name must still produce a non-empty result.
+  EXPECT_FALSE(BaseOf("123").empty());
+}
+
+/// Test fixture that builds an X86-64 TargetMachine and exposes its
+/// TargetInstrInfo to the vocabulary tests.
+///
+/// SetUp() registers the X86 target, creates a fresh LLVMContext and Module,
+/// instantiates a TargetMachine for "x86_64-unknown-linux-gnu" and looks up
+/// the TargetInstrInfo through the subtarget of a dummy function. Any failure
+/// along the way is reported through a fatal gtest assertion.
+class MIR2VecVocabTestFixture : public ::testing::Test {
+protected:
+  std::unique_ptr<LLVMContext> Ctx;
+  std::unique_ptr<Module> M;
+  std::unique_ptr<TargetMachine> TM;
+  // Non-owning; obtained via TM in SetUp(). Starts as nullptr so it never
+  // holds an indeterminate value if SetUp() returns early on a failed ASSERT.
+  const TargetInstrInfo *TII = nullptr;
+
+  void SetUp() override {
+    LLVMInitializeX86TargetInfo();
+    LLVMInitializeX86Target();
+    LLVMInitializeX86TargetMC();
+
+    Ctx = std::make_unique<LLVMContext>();
+    M = std::make_unique<Module>("test", *Ctx);
+
+    // Set up X86 target
+    Triple TargetTriple("x86_64-unknown-linux-gnu");
+    M->setTargetTriple(TargetTriple);
+
+    std::string Error;
+    const Target *TheTarget =
+        TargetRegistry::lookupTarget(M->getTargetTriple(), Error);
+    ASSERT_TRUE(TheTarget) << "Failed to lookup target: " << Error;
+
+    TargetOptions Options;
+    TM = std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
+        M->getTargetTriple(), "", "", Options, Reloc::Model::Static));
+    ASSERT_TRUE(TM);
+
+    // Create a dummy function to get subtarget info
+    FunctionType *FT = FunctionType::get(Type::getVoidTy(*Ctx), false);
+    Function *F =
+        Function::Create(FT, Function::ExternalLinkage, "test", M.get());
+
+    // Get the target instruction info
+    TII = TM->getSubtargetImpl(*F)->getInstrInfo();
+    ASSERT_TRUE(TII);
+  }
+
+  void TearDown() override {
+    // Clear the borrowed pointer before releasing TM, through which it was
+    // obtained, so it cannot be left dangling.
+    TII = nullptr;
+    TM.reset();
+    M.reset();
+    Ctx.reset();
+  }
+};
+
+TEST_F(MIR2VecVocabTestFixture, CanonicalOpcodeMappingTest) {
+  // Test that same base opcodes get same canonical indices
+  std::string baseName1 = MIRVocabulary::extractBaseOpcodeName("ADD16ri");
----------------
mtrofin wrote:

Naming: this should be `BaseName1` (LLVM style capitalizes variable names).

The same applies in a few places further down.

https://github.com/llvm/llvm-project/pull/161463


More information about the llvm-commits mailing list