[llvm] [TySan] A Type Sanitizer (LLVM) (PR #76259)
Florian Mayer via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 6 04:03:20 PST 2024
================
@@ -0,0 +1,868 @@
+//===----- TypeSanitizer.cpp - type-based-aliasing-violation detector -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer, a type-based-aliasing-violation
+// detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tysan"
+
+static const char *const kTysanModuleCtorName = "tysan.module_ctor"; // Name of the module ctor created in insertModuleCtor().
+static const char *const kTysanInitName = "__tysan_init"; // Init function name handed to createSanitizerCtorAndInitFunctions().
+static const char *const kTysanCheckName = "__tysan_check"; // Check callback declared in initializeCallbacks().
+static const char *const kTysanGVNamePrefix = "__tysan_v1_"; // Prefix of every name produced by encodeName().
+
+static const char *const kTysanShadowMemoryAddress =
+ "__tysan_shadow_memory_address"; // Not referenced in the visible part of the file; presumably a runtime symbol -- TODO confirm.
+static const char *const kTysanAppMemMask = "__tysan_app_memory_mask"; // Likewise not referenced in the visible part -- TODO confirm.
+
+static cl::opt<bool>
+ ClWritesAlwaysSetType("tysan-writes-always-set-type",
+ cl::desc("Writes always set the type"), cl::Hidden,
+ cl::init(false)); // Hidden option, off by default.
+
+STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
+
+static Regex AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N"); // NOTE(review): same pattern as the TypeSanitizer::AnonNameRegex member, which hides this one inside member functions; confirm this file-scope copy is still needed.
+
+namespace {
+
+/// TypeSanitizer: instrument the code in module to find type-based aliasing
+/// violations.
+///
+/// The constructor caches the pointer-sized integer type of the module and
+/// declares the run-time callbacks, so an instance is tied to one module.
+struct TypeSanitizer {
+  TypeSanitizer(Module &M);
+  bool run(Function &F, const TargetLibraryInfo &TLI);
+
+  /// Emit type-setting code for the globals listed in the
+  /// "llvm.tysan.globals" named metadata and call it from the module ctor.
+  void instrumentGlobals();
+
+private:
+  /// Maps a TBAA type node to the global variable holding its descriptor.
+  using TypeDescriptorsMapTy =
+      SmallDenseMap<const MDNode *, GlobalVariable *, 8>;
+  /// Maps a TBAA type node to its (possibly synthesized) type name.
+  using TypeNameMapTy = SmallDenseMap<const MDNode *, std::string, 8>;
+
+  /// (Re)declare __tysan_check and the module ctor in \p M and cache them.
+  void initializeCallbacks(Module &M);
+
+  Value *getShadowBase(Function &F);
+  Value *getAppMemMask(Function &F);
+
+  bool instrumentWithShadowUpdate(IRBuilder<> &IRB, const MDNode *TBAAMD,
+                                  Value *Ptr, uint64_t AccessSize, bool IsRead,
+                                  bool IsWrite, Value *&ShadowBase,
+                                  Value *&AppMemMask, bool ForceSetType,
+                                  bool SanitizeFunction,
+                                  TypeDescriptorsMapTy &TypeDescriptors,
+                                  const DataLayout &DL);
+  bool instrumentMemoryAccess(Instruction *I, MemoryLocation &MLoc,
+                              Value *&ShadowBase, Value *&AppMemMask,
+                              bool SanitizeFunction,
+                              TypeDescriptorsMapTy &TypeDescriptors,
+                              const DataLayout &DL);
+  bool instrumentMemInst(Value *I, Value *&ShadowBase, Value *&AppMemMask,
+                         const DataLayout &DL);
+
+  /// Build a name for a type node whose own name is empty by hashing its
+  /// member names and offsets. Returns "" when the node is malformed.
+  std::string getAnonymousStructIdentifier(const MDNode *MD,
+                                           TypeNameMapTy &TypeNames);
+  bool generateTypeDescriptor(const MDNode *MD,
+                              TypeDescriptorsMapTy &TypeDescriptors,
+                              TypeNameMapTy &TypeNames, Module &M);
+  bool generateBaseTypeDescriptor(const MDNode *MD,
+                                  TypeDescriptorsMapTy &TypeDescriptors,
+                                  TypeNameMapTy &TypeNames, Module &M);
+
+  const Triple TargetTriple;
+  Regex AnonNameRegex;
+  /// Integer type as wide as a pointer, taken from the module's DataLayout.
+  Type *IntptrTy = nullptr;
+  /// Number of trailing zero bits of the pointer size in bytes.
+  uint64_t PtrShift = 0;
+  /// i32; used for the size and flags arguments of __tysan_check.
+  IntegerType *OrdTy = nullptr;
+
+  // Callbacks to the run-time library; the first two are set by
+  // initializeCallbacks().
+  Function *TysanCheck = nullptr;
+  Function *TysanCtorFunction = nullptr;
+  // Looked up by name in the constructor; reset and lazily created by
+  // instrumentGlobals().
+  Function *TysanGlobalsSetTypeFunction = nullptr;
+};
+} // namespace
+
+/// Cache per-module state: the target triple, the pointer-sized integer type
+/// and its byte-size shift, any existing "__tysan_set_globals_types"
+/// function, and the run-time callbacks.
+TypeSanitizer::TypeSanitizer(Module &M)
+    : TargetTriple(Triple(M.getTargetTriple())),
+      AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N"),
+      IntptrTy(M.getDataLayout().getIntPtrType(M.getContext())) {
+  // The pointer size in bytes is a power of two, so counting its trailing
+  // zero bits yields the shift amount.
+  PtrShift = countr_zero(IntptrTy->getPrimitiveSizeInBits() / 8);
+
+  // May be null: the function only exists once globals were instrumented.
+  TysanGlobalsSetTypeFunction = M.getFunction("__tysan_set_globals_types");
+
+  initializeCallbacks(M);
+}
+
+/// Declare (or find) the run-time check function and the module constructor
+/// in \p M, both marked nounwind, and remember them in TysanCheck and
+/// TysanCtorFunction. Also sets OrdTy to i32.
+void TypeSanitizer::initializeCallbacks(Module &M) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> IRB(Ctx);
+  OrdTy = IRB.getInt32Ty();
+
+  Type *VoidTy = IRB.getVoidTy();
+  PointerType *PtrTy = IRB.getPtrTy();
+  const AttributeList NoUnwindAttrs =
+      AttributeList().addFnAttribute(Ctx, Attribute::NoUnwind);
+
+  // void __tysan_check(ptr Data, i32 Size, ptr TypeDescriptor, i32 Flags)
+  Type *const CheckArgTys[] = {
+      PtrTy, // Pointer to data to be read.
+      OrdTy, // Size of the data in bytes.
+      PtrTy, // Pointer to type descriptor.
+      OrdTy, // Flags.
+  };
+  FunctionType *CheckFnTy =
+      FunctionType::get(VoidTy, CheckArgTys, /*isVarArg=*/false);
+  FunctionCallee CheckCallee =
+      M.getOrInsertFunction(kTysanCheckName, CheckFnTy, NoUnwindAttrs);
+  TysanCheck = cast<Function>(CheckCallee.getCallee());
+
+  // void tysan.module_ctor()
+  FunctionType *CtorFnTy = FunctionType::get(VoidTy, /*isVarArg=*/false);
+  FunctionCallee CtorCallee =
+      M.getOrInsertFunction(kTysanModuleCtorName, CtorFnTy, NoUnwindAttrs);
+  TysanCtorFunction = cast<Function>(CtorCallee.getCallee());
+}
+
+/// For every (global, TBAA type) pair recorded in the "llvm.tysan.globals"
+/// named metadata, emit code that sets the global's type. That code lives in
+/// an internal "__tysan_set_globals_types" function, which is created on
+/// first use and called from the module constructor.
+void TypeSanitizer::instrumentGlobals() {
+  Module &M = *TysanCtorFunction->getParent();
+  initializeCallbacks(M);
+  TysanGlobalsSetTypeFunction = nullptr;
+
+  NamedMDNode *GlobalsMD = M.getNamedMetadata("llvm.tysan.globals");
+  if (GlobalsMD == nullptr)
+    return;
+
+  LLVMContext &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+  Value *ShadowBase = nullptr;
+  Value *AppMemMask = nullptr;
+  TypeDescriptorsMapTy TypeDescriptors;
+  TypeNameMapTy TypeNames;
+
+  for (const MDNode *Entry : GlobalsMD->operands()) {
+    // Operand 0 is the global itself; skip entries that no longer refer to
+    // a global variable.
+    auto *Global =
+        mdconst::dyn_extract_or_null<GlobalVariable>(Entry->getOperand(0));
+    if (Global == nullptr)
+      continue;
+
+    // Operand 1 is the TBAA type node of the global.
+    const MDNode *TypeMD = cast<MDNode>(Entry->getOperand(1));
+    if (!generateBaseTypeDescriptor(TypeMD, TypeDescriptors, TypeNames, M))
+      continue;
+
+    // Create the helper function (a single block ending in `ret void`) the
+    // first time a global actually needs it.
+    if (TysanGlobalsSetTypeFunction == nullptr) {
+      FunctionType *HelperTy =
+          FunctionType::get(Type::getVoidTy(Ctx), false);
+      TysanGlobalsSetTypeFunction =
+          Function::Create(HelperTy, GlobalValue::InternalLinkage,
+                           "__tysan_set_globals_types", &M);
+      BasicBlock *Entry =
+          BasicBlock::Create(Ctx, "", TysanGlobalsSetTypeFunction);
+      ReturnInst::Create(Ctx, Entry);
+    }
+
+    // New code goes right before the helper's return.
+    Instruction *InsertPt =
+        TysanGlobalsSetTypeFunction->getEntryBlock().getTerminator();
+    IRBuilder<> IRB(InsertPt);
+
+    Type *GlobalTy = Global->getValueType();
+    assert(GlobalTy->isSized());
+    uint64_t StoreSize = DL.getTypeStoreSize(GlobalTy);
+
+    instrumentWithShadowUpdate(IRB, TypeMD, Global, StoreSize,
+                               /*IsRead=*/false, /*IsWrite=*/false, ShadowBase,
+                               AppMemMask, /*ForceSetType=*/true,
+                               /*SanitizeFunction=*/false, TypeDescriptors,
+                               DL);
+  }
+
+  // Nothing was instrumented, so there is no helper to call.
+  if (TysanGlobalsSetTypeFunction == nullptr)
+    return;
+
+  IRBuilder<> CtorIRB(TysanCtorFunction->getEntryBlock().getTerminator());
+  CtorIRB.CreateCall(TysanGlobalsSetTypeFunction, {});
+}
+
+/// Create the TySan module constructor together with its call to the
+/// run-time init function, add the global-type setup to it, and register it
+/// in the module's global constructor list with priority 0.
+static void insertModuleCtor(Module &M) {
+  // Only the constructor half of the returned pair is needed here.
+  Function *Ctor =
+      createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
+                                          kTysanInitName, /*InitArgTypes=*/{},
+                                          /*InitArgs=*/{})
+          .first;
+
+  TypeSanitizer TySan(M);
+  TySan.instrumentGlobals();
+
+  appendToGlobalCtors(M, Ctor, /*Priority=*/0);
+}
+
+static const char LUT[] = "0123456789abcdef";
+
+/// Turn a type name into a symbol name made only of [A-Za-z0-9_].
+///
+/// The result starts with kTysanGVNamePrefix. ASCII letters and digits are
+/// copied, '_' becomes "__", and every other byte becomes '_' followed by
+/// its two lower-case hex digits. Since a hex digit is never '_', the
+/// encoding of two different names never coincides.
+static std::string encodeName(StringRef Name) {
+  std::string Output = kTysanGVNamePrefix;
+  // Worst case: every input byte expands to three output characters.
+  Output.reserve(Output.size() + 3 * Name.size());
+
+  for (const unsigned char C : Name) {
+    // Use LLVM's ASCII-only isAlnum rather than <cctype> isalnum: the latter
+    // consults the process locale, and symbol names must not depend on it.
+    if (isAlnum(C)) {
+      Output.push_back(C);
+      continue;
+    }
+
+    if (C == '_') {
+      Output.append("__");
+      continue;
+    }
+
+    Output.push_back('_');
+    Output.push_back(LUT[C >> 4]);
+    Output.push_back(LUT[C & 15]);
+  }
+
+  return Output;
+}
+
+/// Synthesize a name for a TBAA struct-type node whose own name is empty.
+///
+/// Operands 1.. of \p MD are read as (member type node, offset) pairs. The
+/// result is "__anonymous_" followed by the MD5 digest of every member's name
+/// and decimal offset, each terminated by a NUL byte. Member names are looked
+/// up in, and added to, \p TypeNames; members that are themselves unnamed are
+/// handled recursively.
+///
+/// \returns the identifier, or "" if \p MD is not shaped as described above
+/// (non-node member, member without a string name, missing or non-constant
+/// offset).
+std::string
+TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD,
+                                            TypeNameMapTy &TypeNames) {
+  MD5 Hash;
+  // The length must be spelled out: a StringRef built from the literal "\0"
+  // alone measures it as a C string and ends up empty, which would hash
+  // nothing and let ("a", 12) collide with ("a1", 2).
+  const StringRef Separator("\0", 1);
+
+  for (unsigned I = 1, E = MD->getNumOperands(); I < E; I += 2) {
+    // A member type without a following offset operand is malformed.
+    if (I + 1 >= E)
+      return "";
+
+    const MDNode *MemberNode = dyn_cast_or_null<MDNode>(MD->getOperand(I));
+    if (!MemberNode)
+      return "";
+
+    auto TNI = TypeNames.find(MemberNode);
+    std::string MemberName;
+    if (TNI != TypeNames.end()) {
+      MemberName = TNI->second;
+    } else {
+      if (MemberNode->getNumOperands() < 1)
+        return "";
+      MDString *MemberNameNode = dyn_cast<MDString>(MemberNode->getOperand(0));
+      if (!MemberNameNode)
+        return "";
+      MemberName = MemberNameNode->getString().str();
+      // An unnamed member gets a synthesized identifier of its own.
+      if (MemberName.empty())
+        MemberName = getAnonymousStructIdentifier(MemberNode, TypeNames);
+      if (MemberName.empty())
+        return "";
+      TypeNames[MemberNode] = MemberName;
+    }
+
+    Hash.update(MemberName);
+    Hash.update(Separator);
+
+    const auto *OffsetCI =
+        mdconst::dyn_extract_or_null<ConstantInt>(MD->getOperand(I + 1));
+    if (!OffsetCI)
+      return "";
+    Hash.update(utostr(OffsetCI->getZExtValue()));
+    Hash.update(Separator);
+  }
+
+  MD5::MD5Result HashResult;
+  Hash.final(HashResult);
+  return "__anonymous_" + std::string(HashResult.digest().str());
+}
+
+bool TypeSanitizer::generateBaseTypeDescriptor(
+ const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M) {
+ if (MD->getNumOperands() < 1)
+ return false;
+
+ MDString *NameNode = dyn_cast<MDString>(MD->getOperand(0));
+ if (!NameNode)
+ return false;
+
+ std::string Name = NameNode->getString().str();
+ if (Name.empty())
+ Name = getAnonymousStructIdentifier(MD, TypeNames);
+ if (Name.empty())
+ return false;
+ TypeNames[MD] = Name;
+ std::string EncodedName = encodeName(Name);
+
+ GlobalVariable *GV =
+ dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+ if (GV) {
+ TypeDescriptors[MD] = GV;
+ return true;
+ }
+
+ SmallVector<std::pair<Constant *, uint64_t>> Members;
+ for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+ const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+ if (!MemberNode)
+ return false;
+
+ Constant *Member;
+ auto TDI = TypeDescriptors.find(MemberNode);
----------------
fmayer wrote:
(also in other places)
https://github.com/llvm/llvm-project/pull/76259
More information about the llvm-commits
mailing list