[llvm] [TySan] A Type Sanitizer (LLVM) (PR #76259)

Florian Mayer via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 05:52:15 PST 2024


================
@@ -0,0 +1,868 @@
+//===----- TypeSanitizer.cpp - type-based-aliasing-violation detector -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer, a type-based-aliasing-violation
+// detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tysan"
+
+static const char *const kTysanModuleCtorName = "tysan.module_ctor";
+static const char *const kTysanInitName = "__tysan_init";
+static const char *const kTysanCheckName = "__tysan_check";
+static const char *const kTysanGVNamePrefix = "__tysan_v1_";
+
+static const char *const kTysanShadowMemoryAddress =
+    "__tysan_shadow_memory_address";
+static const char *const kTysanAppMemMask = "__tysan_app_memory_mask";
+
+static cl::opt<bool>
+    ClWritesAlwaysSetType("tysan-writes-always-set-type",
+                          cl::desc("Writes always set the type"), cl::Hidden,
+                          cl::init(false));
+
+STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
+
+static Regex AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N");
+
+namespace {
+
+/// TypeSanitizer: instruments the code in a module to find type-based aliasing
+/// violations.
+struct TypeSanitizer {
+  TypeSanitizer(Module &M);
+  bool run(Function &F, const TargetLibraryInfo &TLI);
+
+  /// Emits code that sets the types of the globals listed in the
+  /// "llvm.tysan.globals" named metadata and calls it from the module ctor.
+  void instrumentGlobals();
+
+private:
+  /// TBAA metadata node -> global variable holding its type descriptor.
+  using TypeDescriptorsMapTy =
+      SmallDenseMap<const MDNode *, GlobalVariable *, 8>;
+  /// TBAA metadata node -> type name resolved for it.
+  using TypeNameMapTy = SmallDenseMap<const MDNode *, std::string, 8>;
+
+  /// Sets OrdTy and looks up (or declares) TysanCheck and TysanCtorFunction
+  /// in \p M.
+  void initializeCallbacks(Module &M);
+
+  Value *getShadowBase(Function &F);
+  Value *getAppMemMask(Function &F);
+
+  bool instrumentWithShadowUpdate(IRBuilder<> &IRB, const MDNode *TBAAMD,
+                                  Value *Ptr, uint64_t AccessSize, bool IsRead,
+                                  bool IsWrite, Value *&ShadowBase,
+                                  Value *&AppMemMask, bool ForceSetType,
+                                  bool SanitizeFunction,
+                                  TypeDescriptorsMapTy &TypeDescriptors,
+                                  const DataLayout &DL);
+  bool instrumentMemoryAccess(Instruction *I, MemoryLocation &MLoc,
+                              Value *&ShadowBase, Value *&AppMemMask,
+                              bool SanitizeFunction,
+                              TypeDescriptorsMapTy &TypeDescriptors,
+                              const DataLayout &DL);
+  bool instrumentMemInst(Value *I, Value *&ShadowBase, Value *&AppMemMask,
+                         const DataLayout &DL);
+
+  /// Builds an "__anonymous_<hash>" name for an unnamed type node from its
+  /// members' names and offsets; returns an empty string on failure.
+  std::string getAnonymousStructIdentifier(const MDNode *MD,
+                                           TypeNameMapTy &TypeNames);
+  bool generateTypeDescriptor(const MDNode *MD,
+                              TypeDescriptorsMapTy &TypeDescriptors,
+                              TypeNameMapTy &TypeNames, Module &M);
+  bool generateBaseTypeDescriptor(const MDNode *MD,
+                                  TypeDescriptorsMapTy &TypeDescriptors,
+                                  TypeNameMapTy &TypeNames, Module &M);
+
+  /// Target triple of the module passed to the constructor.
+  const Triple TargetTriple;
+  Regex AnonNameRegex;
+  /// Pointer-sized integer type taken from the module's data layout.
+  Type *IntptrTy;
+  /// Number of trailing zero bits of the pointer size in bytes.
+  uint64_t PtrShift;
+  /// 32-bit integer type used for the size and flags arguments of the check.
+  IntegerType *OrdTy;
+
+  // Callbacks to the run-time library are set up by initializeCallbacks().
+  Function *TysanCheck;
+  Function *TysanCtorFunction;
+  // Looked up by name in the constructor; reset and, if needed, created by
+  // instrumentGlobals().
+  Function *TysanGlobalsSetTypeFunction;
+};
+} // namespace
+
+/// Caches per-module state: the pointer-sized integer type, the derived
+/// pointer shift, any existing "__tysan_set_globals_types" function, and the
+/// run-time callbacks.
+TypeSanitizer::TypeSanitizer(Module &M)
+    : TargetTriple(Triple(M.getTargetTriple())),
+      AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N") {
+  IntptrTy = M.getDataLayout().getIntPtrType(M.getContext());
+
+  // Trailing zero bits of the pointer size in bytes.
+  const uint64_t PtrSizeInBytes = IntptrTy->getPrimitiveSizeInBits() / 8;
+  PtrShift = countr_zero(PtrSizeInBytes);
+
+  TysanGlobalsSetTypeFunction = M.getFunction("__tysan_set_globals_types");
+  initializeCallbacks(M);
+}
+
+/// Sets OrdTy and looks up (or declares) the nounwind run-time check function
+/// and the module constructor in \p M, caching both as Function pointers.
+void TypeSanitizer::initializeCallbacks(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> IRB(Ctx);
+  OrdTy = IRB.getInt32Ty();
+
+  Type *VoidTy = IRB.getVoidTy();
+  PointerType *PtrTy = IRB.getPtrTy();
+  const AttributeList Attr =
+      AttributeList().addFnAttribute(Ctx, Attribute::NoUnwind);
+
+  // void __tysan_check(ptr Data, i32 SizeInBytes, ptr TypeDescriptor,
+  //                    i32 Flags)
+  Value *CheckCallee =
+      M.getOrInsertFunction(kTysanCheckName, Attr, VoidTy,
+                            PtrTy, // Pointer to data to be read.
+                            OrdTy, // Size of the data in bytes.
+                            PtrTy, // Pointer to type descriptor.
+                            OrdTy  // Flags.
+                            )
+          .getCallee();
+  TysanCheck = cast<Function>(CheckCallee);
+
+  // void tysan.module_ctor()
+  Value *CtorCallee =
+      M.getOrInsertFunction(kTysanModuleCtorName, Attr, VoidTy).getCallee();
+  TysanCtorFunction = cast<Function>(CtorCallee);
+}
+
+/// Walks the "llvm.tysan.globals" named metadata and, for every entry that
+/// names a global variable with a usable type descriptor, emits a forced
+/// type-setting shadow update into an internal "__tysan_set_globals_types"
+/// function. That function is created lazily and, if it was created, called
+/// from the module constructor. Entries that cannot be interpreted are
+/// skipped.
+void TypeSanitizer::instrumentGlobals() {
+  // The constructor has already run initializeCallbacks(), so
+  // TysanCtorFunction is valid and the callbacks need not be set up again.
+  Module &M = *TysanCtorFunction->getParent();
+  TysanGlobalsSetTypeFunction = nullptr;
+
+  NamedMDNode *Globals = M.getNamedMetadata("llvm.tysan.globals");
+  if (!Globals)
+    return;
+
+  const DataLayout &DL = M.getDataLayout();
+  Value *ShadowBase = nullptr, *AppMemMask = nullptr;
+  TypeDescriptorsMapTy TypeDescriptors;
+  TypeNameMapTy TypeNames;
+
+  for (const MDNode *GMD : Globals->operands()) {
+    // Each entry is expected to be a (global, TBAA type node) pair; skip
+    // anything else instead of asserting on it.
+    if (GMD->getNumOperands() < 2)
+      continue;
+    auto *GV = mdconst::dyn_extract_or_null<GlobalVariable>(GMD->getOperand(0));
+    if (!GV)
+      continue;
+    const auto *TBAAMD = dyn_cast_or_null<MDNode>(GMD->getOperand(1));
+    if (!TBAAMD)
+      continue;
+    if (!generateBaseTypeDescriptor(TBAAMD, TypeDescriptors, TypeNames, M))
+      continue;
+
+    // Create the helper on first use: an internal void() function whose body
+    // is a single return.
+    if (!TysanGlobalsSetTypeFunction) {
+      TysanGlobalsSetTypeFunction = Function::Create(
+          FunctionType::get(Type::getVoidTy(M.getContext()), false),
+          GlobalValue::InternalLinkage, "__tysan_set_globals_types", &M);
+      BasicBlock *BB =
+          BasicBlock::Create(M.getContext(), "", TysanGlobalsSetTypeFunction);
+      ReturnInst::Create(M.getContext(), BB);
+    }
+
+    // Insert before the terminator of the helper's entry block.
+    IRBuilder<> IRB(
+        TysanGlobalsSetTypeFunction->getEntryBlock().getTerminator());
+    Type *AccessTy = GV->getValueType();
+    assert(AccessTy->isSized());
+    uint64_t AccessSize = DL.getTypeStoreSize(AccessTy);
+    instrumentWithShadowUpdate(IRB, TBAAMD, GV, AccessSize, false, false,
+                               ShadowBase, AppMemMask, true, false,
+                               TypeDescriptors, DL);
+  }
+
+  if (TysanGlobalsSetTypeFunction) {
+    IRBuilder<> IRB(TysanCtorFunction->getEntryBlock().getTerminator());
+    IRB.CreateCall(TysanGlobalsSetTypeFunction, {});
+  }
+}
+
+/// Creates the sanitizer module constructor (which calls the run-time init
+/// function), instruments the module's globals, and registers the constructor
+/// in the global ctor list with priority 0.
+static void insertModuleCtor(Module &M) {
+  // Only the constructor of the returned (ctor, init) pair is needed.
+  Function *Ctor =
+      createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
+                                          kTysanInitName, /*InitArgTypes=*/{},
+                                          /*InitArgs=*/{})
+          .first;
+
+  TypeSanitizer Sanitizer(M);
+  Sanitizer.instrumentGlobals();
+  appendToGlobalCtors(M, Ctor, 0);
+}
+
+static const char LUT[] = "0123456789abcdef";
+
+/// Returns kTysanGVNamePrefix followed by an escaped form of \p Name:
+///  - ASCII letters and digits are copied unchanged,
+///  - '_' becomes "__",
+///  - every other byte becomes '_' followed by two lowercase hex digits.
+///
+/// Classification uses the locale-independent isAlnum() so the generated
+/// symbol name does not depend on the host process's C locale.
+static std::string encodeName(StringRef Name) {
+  std::string Output = kTysanGVNamePrefix;
+  // Worst case: every input byte expands to three output characters.
+  Output.reserve(Output.size() + 3 * Name.size());
+  for (const unsigned char C : Name) {
+    if (isAlnum(C)) {
+      Output.push_back(C);
+      continue;
+    }
+
+    if (C == '_') {
+      Output.append("__");
+      continue;
+    }
+
+    Output.push_back('_');
+    Output.push_back(LUT[C >> 4]);
+    Output.push_back(LUT[C & 15]);
+  }
+
+  return Output;
+}
+
+/// Builds a name for the unnamed type node \p MD by hashing, for each
+/// (member type node, offset) operand pair starting at operand 1, the member's
+/// name and its offset, each followed by a NUL separator byte.
+///
+/// Member names come from \p TypeNames when cached; otherwise from the member
+/// node's first operand, recursing for members that are themselves unnamed.
+/// Newly resolved member names are stored in \p TypeNames.
+///
+/// \returns "__anonymous_" followed by the MD5 digest, or an empty string if
+/// any operand cannot be interpreted.
+std::string
+TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD,
+                                            TypeNameMapTy &TypeNames) {
+  // An explicit one-byte NUL. Passing the literal "\0" directly would be
+  // converted as a C string, yielding an empty StringRef and hashing nothing.
+  const StringRef Separator("\0", 1);
+  MD5 Hash;
+
+  for (unsigned i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+    // A member type without a following offset operand is malformed.
+    if (i + 1 >= e)
+      return "";
+
+    const MDNode *MemberNode = dyn_cast_or_null<MDNode>(MD->getOperand(i));
+    if (!MemberNode)
+      return "";
+
+    auto TNI = TypeNames.find(MemberNode);
+    std::string MemberName;
+    if (TNI != TypeNames.end()) {
+      MemberName = TNI->second;
+    } else {
+      if (MemberNode->getNumOperands() < 1)
+        return "";
+      MDString *MemberNameNode =
+          dyn_cast_or_null<MDString>(MemberNode->getOperand(0));
+      if (!MemberNameNode)
+        return "";
+      MemberName = MemberNameNode->getString().str();
+      if (MemberName.empty())
+        MemberName = getAnonymousStructIdentifier(MemberNode, TypeNames);
+      if (MemberName.empty())
+        return "";
+      TypeNames[MemberNode] = MemberName;
+    }
+
+    auto *OffsetCI =
+        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(i + 1));
+    if (!OffsetCI)
+      return "";
+
+    Hash.update(MemberName);
+    Hash.update(Separator);
+    Hash.update(utostr(OffsetCI->getZExtValue()));
+    Hash.update(Separator);
+  }
+
+  MD5::MD5Result HashResult;
+  Hash.final(HashResult);
+  return "__anonymous_" + std::string(HashResult.digest().str());
+}
+
+bool TypeSanitizer::generateBaseTypeDescriptor(
+    const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+    TypeNameMapTy &TypeNames, Module &M) {
+  if (MD->getNumOperands() < 1)
+    return false;
+
+  MDString *NameNode = dyn_cast<MDString>(MD->getOperand(0));
+  if (!NameNode)
+    return false;
+
+  std::string Name = NameNode->getString().str();
+  if (Name.empty())
+    Name = getAnonymousStructIdentifier(MD, TypeNames);
+  if (Name.empty())
+    return false;
+  TypeNames[MD] = Name;
+  std::string EncodedName = encodeName(Name);
+
+  GlobalVariable *GV =
+      dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+  if (GV) {
+    TypeDescriptors[MD] = GV;
+    return true;
+  }
+
+  SmallVector<std::pair<Constant *, uint64_t>> Members;
+  for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+    const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+    if (!MemberNode)
+      return false;
+
+    Constant *Member;
+    auto TDI = TypeDescriptors.find(MemberNode);
----------------
fmayer wrote:

no strong preference

https://github.com/llvm/llvm-project/pull/76259


More information about the llvm-commits mailing list